From ec9964b4803300fb86f8e8fd9b421e59f7a71dc5 Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Mon, 13 May 2019 09:56:34 +0200 Subject: Platform: OLPC: Move EC-specific functionality out from x86 Move the olpc-ec driver away from the X86 OLPC platform so that it could be used by the ARM based laptops too. Notably, the driver for the OLPC battery, which is also used on the ARM models, builds on this driver's interface. It is actually plaform independent: the OLPC EC commands with their argument and responses are mostly the same despite the delivery mechanism is different. Signed-off-by: Lubomir Rintel Acked-by: Pavel Machek Signed-off-by: Andy Shevchenko --- arch/x86/include/asm/olpc.h | 31 ----------- arch/x86/platform/olpc/olpc.c | 119 +++++++----------------------------------- 2 files changed, 18 insertions(+), 132 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h index c2bf1de5d901..6fe76282aceb 100644 --- a/arch/x86/include/asm/olpc.h +++ b/arch/x86/include/asm/olpc.h @@ -9,12 +9,10 @@ struct olpc_platform_t { int flags; uint32_t boardrev; - int ecver; }; #define OLPC_F_PRESENT 0x01 #define OLPC_F_DCON 0x02 -#define OLPC_F_EC_WIDE_SCI 0x04 #ifdef CONFIG_OLPC @@ -64,13 +62,6 @@ static inline int olpc_board_at_least(uint32_t rev) return olpc_platform_info.boardrev >= rev; } -extern void olpc_ec_wakeup_set(u16 value); -extern void olpc_ec_wakeup_clear(u16 value); -extern bool olpc_ec_wakeup_available(void); - -extern int olpc_ec_mask_write(u16 bits); -extern int olpc_ec_sci_query(u16 *sci_value); - #else static inline int machine_is_olpc(void) @@ -83,14 +74,6 @@ static inline int olpc_has_dcon(void) return 0; } -static inline void olpc_ec_wakeup_set(u16 value) { } -static inline void olpc_ec_wakeup_clear(u16 value) { } - -static inline bool olpc_ec_wakeup_available(void) -{ - return false; -} - #endif #ifdef CONFIG_OLPC_XO1_PM @@ -101,20 +84,6 @@ extern void olpc_xo1_pm_wakeup_clear(u16 value); extern int pci_olpc_init(void); -/* SCI source values */ - -#define EC_SCI_SRC_EMPTY 0x00 -#define EC_SCI_SRC_GAME 0x01 -#define EC_SCI_SRC_BATTERY 0x02 -#define EC_SCI_SRC_BATSOC 0x04 -#define EC_SCI_SRC_BATERR 0x08 -#define EC_SCI_SRC_EBOOK 0x10 /* XO-1 only */ -#define EC_SCI_SRC_WLAN 0x20 /* XO-1 only */ -#define EC_SCI_SRC_ACPWR 0x40 -#define EC_SCI_SRC_BATCRIT 0x80 -#define EC_SCI_SRC_GPWAKE 0x100 /* XO-1.5 only */ -#define EC_SCI_SRC_ALL 0x1FF - /* GPIO assignments */ #define OLPC_GPIO_MIC_AC 1 diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c index f0e920fb98ad..c6c62b4f251f 100644 --- a/arch/x86/platform/olpc/olpc.c +++ b/arch/x86/platform/olpc/olpc.c @@ -30,9 +30,6 @@ struct olpc_platform_t olpc_platform_info; EXPORT_SYMBOL_GPL(olpc_platform_info); -/* EC event mask to be applied during suspend (defining wakeup sources). */ -static u16 ec_wakeup_mask; - /* what the timeout *should* be (in ms) */ #define EC_BASE_TIMEOUT 20 @@ -186,83 +183,6 @@ err: return ret; } -void olpc_ec_wakeup_set(u16 value) -{ - ec_wakeup_mask |= value; -} -EXPORT_SYMBOL_GPL(olpc_ec_wakeup_set); - -void olpc_ec_wakeup_clear(u16 value) -{ - ec_wakeup_mask &= ~value; -} -EXPORT_SYMBOL_GPL(olpc_ec_wakeup_clear); - -/* - * Returns true if the compile and runtime configurations allow for EC events - * to wake the system. - */ -bool olpc_ec_wakeup_available(void) -{ - if (!machine_is_olpc()) - return false; - - /* - * XO-1 EC wakeups are available when olpc-xo1-sci driver is - * compiled in - */ -#ifdef CONFIG_OLPC_XO1_SCI - if (olpc_platform_info.boardrev < olpc_board_pre(0xd0)) /* XO-1 */ - return true; -#endif - - /* - * XO-1.5 EC wakeups are available when olpc-xo15-sci driver is - * compiled in - */ -#ifdef CONFIG_OLPC_XO15_SCI - if (olpc_platform_info.boardrev >= olpc_board_pre(0xd0)) /* XO-1.5 */ - return true; -#endif - - return false; -} -EXPORT_SYMBOL_GPL(olpc_ec_wakeup_available); - -int olpc_ec_mask_write(u16 bits) -{ - if (olpc_platform_info.flags & OLPC_F_EC_WIDE_SCI) { - __be16 ec_word = cpu_to_be16(bits); - return olpc_ec_cmd(EC_WRITE_EXT_SCI_MASK, (void *) &ec_word, 2, - NULL, 0); - } else { - unsigned char ec_byte = bits & 0xff; - return olpc_ec_cmd(EC_WRITE_SCI_MASK, &ec_byte, 1, NULL, 0); - } -} -EXPORT_SYMBOL_GPL(olpc_ec_mask_write); - -int olpc_ec_sci_query(u16 *sci_value) -{ - int ret; - - if (olpc_platform_info.flags & OLPC_F_EC_WIDE_SCI) { - __be16 ec_word; - ret = olpc_ec_cmd(EC_EXT_SCI_QUERY, - NULL, 0, (void *) &ec_word, 2); - if (ret == 0) - *sci_value = be16_to_cpu(ec_word); - } else { - unsigned char ec_byte; - ret = olpc_ec_cmd(EC_SCI_QUERY, NULL, 0, &ec_byte, 1); - if (ret == 0) - *sci_value = ec_byte; - } - - return ret; -} -EXPORT_SYMBOL_GPL(olpc_ec_sci_query); - static bool __init check_ofw_architecture(struct device_node *root) { const char *olpc_arch; @@ -296,6 +216,10 @@ static bool __init platform_detect(void) if (success) { olpc_platform_info.boardrev = get_board_revision(root); olpc_platform_info.flags |= OLPC_F_PRESENT; + + pr_info("OLPC board revision %s%X\n", + ((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "", + olpc_platform_info.boardrev >> 4); } of_node_put(root); @@ -315,27 +239,8 @@ static int __init add_xo1_platform_devices(void) return PTR_ERR_OR_ZERO(pdev); } -static int olpc_xo1_ec_probe(struct platform_device *pdev) -{ - /* get the EC revision */ - olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0, - (unsigned char *) &olpc_platform_info.ecver, 1); - - /* EC version 0x5f adds support for wide SCI mask */ - if (olpc_platform_info.ecver >= 0x5f) - olpc_platform_info.flags |= OLPC_F_EC_WIDE_SCI; - - pr_info("OLPC board revision %s%X (EC=%x)\n", - ((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "", - olpc_platform_info.boardrev >> 4, - olpc_platform_info.ecver); - - return 0; -} static int olpc_xo1_ec_suspend(struct platform_device *pdev) { - olpc_ec_mask_write(ec_wakeup_mask); - /* * Squelch SCIs while suspended. This is a fix for * . @@ -359,15 +264,27 @@ static int olpc_xo1_ec_resume(struct platform_device *pdev) } static struct olpc_ec_driver ec_xo1_driver = { - .probe = olpc_xo1_ec_probe, .suspend = olpc_xo1_ec_suspend, .resume = olpc_xo1_ec_resume, .ec_cmd = olpc_xo1_ec_cmd, +#ifdef CONFIG_OLPC_XO1_SCI + /* + * XO-1 EC wakeups are available when olpc-xo1-sci driver is + * compiled in + */ + .wakeup_available = true, +#endif }; static struct olpc_ec_driver ec_xo1_5_driver = { - .probe = olpc_xo1_ec_probe, .ec_cmd = olpc_xo1_ec_cmd, +#ifdef CONFIG_OLPC_XO1_5_SCI + /* + * XO-1.5 EC wakeups are available when olpc-xo15-sci driver is + * compiled in + */ + .wakeup_available = true, +#endif }; static int __init olpc_init(void) -- cgit v1.2.3-55-g7522 From 0c3d931b3ab9efeea4948b5373c62095449d0101 Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Mon, 13 May 2019 09:56:37 +0200 Subject: Platform: OLPC: Add XO-1.75 EC driver It's based off the driver from the OLPC kernel sources. Somewhat modernized and cleaned up, for better or worse. Modified to plug into the olpc-ec driver infrastructure (so that battery interface and debugfs could be reused) and the SPI slave framework. Signed-off-by: Lubomir Rintel Reviewed-by: Andy Shevchenko Signed-off-by: Andy Shevchenko --- arch/x86/Kconfig | 1 + drivers/platform/Kconfig | 2 + drivers/platform/Makefile | 2 +- drivers/platform/olpc/Kconfig | 14 + drivers/platform/olpc/Makefile | 3 +- drivers/platform/olpc/olpc-xo175-ec.c | 752 ++++++++++++++++++++++++++++++++++ include/linux/olpc-ec.h | 4 +- 7 files changed, 774 insertions(+), 4 deletions(-) create mode 100644 drivers/platform/olpc/Kconfig create mode 100644 drivers/platform/olpc/olpc-xo175-ec.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2bbbd4d1ba31..cb1c073b3c7e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2698,6 +2698,7 @@ config OLPC select OF select OF_PROMTREE select IRQ_DOMAIN + select OLPC_EC ---help--- Add support for detecting the unique features of the OLPC XO hardware. diff --git a/drivers/platform/Kconfig b/drivers/platform/Kconfig index d4c2e424a700..4313d73d3618 100644 --- a/drivers/platform/Kconfig +++ b/drivers/platform/Kconfig @@ -10,3 +10,5 @@ source "drivers/platform/goldfish/Kconfig" source "drivers/platform/chrome/Kconfig" source "drivers/platform/mellanox/Kconfig" + +source "drivers/platform/olpc/Kconfig" diff --git a/drivers/platform/Makefile b/drivers/platform/Makefile index 4b2ce58bcd9c..6fda58c021ca 100644 --- a/drivers/platform/Makefile +++ b/drivers/platform/Makefile @@ -6,6 +6,6 @@ obj-$(CONFIG_X86) += x86/ obj-$(CONFIG_MELLANOX_PLATFORM) += mellanox/ obj-$(CONFIG_MIPS) += mips/ -obj-$(CONFIG_OLPC) += olpc/ +obj-$(CONFIG_OLPC_EC) += olpc/ obj-$(CONFIG_GOLDFISH) += goldfish/ obj-$(CONFIG_CHROME_PLATFORMS) += chrome/ diff --git a/drivers/platform/olpc/Kconfig b/drivers/platform/olpc/Kconfig new file mode 100644 index 000000000000..559f843199d7 --- /dev/null +++ b/drivers/platform/olpc/Kconfig @@ -0,0 +1,14 @@ +config OLPC_EC + bool + +config OLPC_XO175_EC + tristate "OLPC XO 1.75 Embedded Controller" + depends on ARCH_MMP || COMPILE_TEST + select SPI_SLAVE + select OLPC_EC + help + Include support for the OLPC XO Embedded Controller (EC). The EC + provides various platform services, including support for the power, + button, restart, shutdown and battery charging status. + + Unless you have an OLPC XO laptop, you will want to say N. diff --git a/drivers/platform/olpc/Makefile b/drivers/platform/olpc/Makefile index dc8b26bc7209..01fe6ba01665 100644 --- a/drivers/platform/olpc/Makefile +++ b/drivers/platform/olpc/Makefile @@ -1,4 +1,5 @@ # # OLPC XO platform-specific drivers # -obj-$(CONFIG_OLPC) += olpc-ec.o +obj-$(CONFIG_OLPC_EC) += olpc-ec.o +obj-$(CONFIG_OLPC_XO175_EC) += olpc-xo175-ec.o diff --git a/drivers/platform/olpc/olpc-xo175-ec.c b/drivers/platform/olpc/olpc-xo175-ec.c new file mode 100644 index 000000000000..344d14f3da54 --- /dev/null +++ b/drivers/platform/olpc/olpc-xo175-ec.c @@ -0,0 +1,752 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Driver for the OLPC XO-1.75 Embedded Controller. + * + * The EC protocol is documented at: + * http://wiki.laptop.org/go/XO_1.75_HOST_to_EC_Protocol + * + * Copyright (C) 2010 One Laptop per Child Foundation. + * Copyright (C) 2018 Lubomir Rintel + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct ec_cmd_t { + u8 cmd; + u8 bytes_returned; +}; + +enum ec_chan_t { + CHAN_NONE = 0, + CHAN_SWITCH, + CHAN_CMD_RESP, + CHAN_KEYBOARD, + CHAN_TOUCHPAD, + CHAN_EVENT, + CHAN_DEBUG, + CHAN_CMD_ERROR, +}; + +/* + * EC events + */ +#define EVENT_AC_CHANGE 1 /* AC plugged/unplugged */ +#define EVENT_BATTERY_STATUS 2 /* Battery low/full/error/gone */ +#define EVENT_BATTERY_CRITICAL 3 /* Battery critical voltage */ +#define EVENT_BATTERY_SOC_CHANGE 4 /* 1% SOC Change */ +#define EVENT_BATTERY_ERROR 5 /* Abnormal error, query for cause */ +#define EVENT_POWER_PRESSED 6 /* Power button was pressed */ +#define EVENT_POWER_PRESS_WAKE 7 /* Woken up with a power button */ +#define EVENT_TIMED_HOST_WAKE 8 /* Host wake timer */ +#define EVENT_OLS_HIGH_LIMIT 9 /* OLS crossed dark threshold */ +#define EVENT_OLS_LOW_LIMIT 10 /* OLS crossed light threshold */ + +/* + * EC commands + * (from http://dev.laptop.org/git/users/rsmith/ec-1.75/tree/ec_cmd.h) + */ +#define CMD_GET_API_VERSION 0x08 /* out: u8 */ +#define CMD_READ_VOLTAGE 0x10 /* out: u16, *9.76/32, mV */ +#define CMD_READ_CURRENT 0x11 /* out: s16, *15.625/120, mA */ +#define CMD_READ_ACR 0x12 /* out: s16, *6250/15, uAh */ +#define CMD_READ_BATT_TEMPERATURE 0x13 /* out: u16, *100/256, deg C */ +#define CMD_READ_AMBIENT_TEMPERATURE 0x14 /* unimplemented, no hardware */ +#define CMD_READ_BATTERY_STATUS 0x15 /* out: u8, bitmask */ +#define CMD_READ_SOC 0x16 /* out: u8, percentage */ +#define CMD_READ_GAUGE_ID 0x17 /* out: u8 * 8 */ +#define CMD_READ_GAUGE_DATA 0x18 /* in: u8 addr, out: u8 data */ +#define CMD_READ_BOARD_ID 0x19 /* out: u16 (platform id) */ +#define CMD_READ_BATT_ERR_CODE 0x1f /* out: u8, error bitmask */ +#define CMD_SET_DCON_POWER 0x26 /* in: u8 */ +#define CMD_RESET_EC 0x28 /* none */ +#define CMD_READ_BATTERY_TYPE 0x2c /* out: u8 */ +#define CMD_SET_AUTOWAK 0x33 /* out: u8 */ +#define CMD_SET_EC_WAKEUP_TIMER 0x36 /* in: u32, out: ? */ +#define CMD_READ_EXT_SCI_MASK 0x37 /* ? */ +#define CMD_WRITE_EXT_SCI_MASK 0x38 /* ? */ +#define CMD_CLEAR_EC_WAKEUP_TIMER 0x39 /* none */ +#define CMD_ENABLE_RUNIN_DISCHARGE 0x3B /* none */ +#define CMD_DISABLE_RUNIN_DISCHARGE 0x3C /* none */ +#define CMD_READ_MPPT_ACTIVE 0x3d /* out: u8 */ +#define CMD_READ_MPPT_LIMIT 0x3e /* out: u8 */ +#define CMD_SET_MPPT_LIMIT 0x3f /* in: u8 */ +#define CMD_DISABLE_MPPT 0x40 /* none */ +#define CMD_ENABLE_MPPT 0x41 /* none */ +#define CMD_READ_VIN 0x42 /* out: u16 */ +#define CMD_EXT_SCI_QUERY 0x43 /* ? */ +#define RSP_KEYBOARD_DATA 0x48 /* ? */ +#define RSP_TOUCHPAD_DATA 0x49 /* ? */ +#define CMD_GET_FW_VERSION 0x4a /* out: u8 * 16 */ +#define CMD_POWER_CYCLE 0x4b /* none */ +#define CMD_POWER_OFF 0x4c /* none */ +#define CMD_RESET_EC_SOFT 0x4d /* none */ +#define CMD_READ_GAUGE_U16 0x4e /* ? */ +#define CMD_ENABLE_MOUSE 0x4f /* ? */ +#define CMD_ECHO 0x52 /* in: u8 * 5, out: u8 * 5 */ +#define CMD_GET_FW_DATE 0x53 /* out: u8 * 16 */ +#define CMD_GET_FW_USER 0x54 /* out: u8 * 16 */ +#define CMD_TURN_OFF_POWER 0x55 /* none (same as 0x4c) */ +#define CMD_READ_OLS 0x56 /* out: u16 */ +#define CMD_OLS_SMT_LEDON 0x57 /* none */ +#define CMD_OLS_SMT_LEDOFF 0x58 /* none */ +#define CMD_START_OLS_ASSY 0x59 /* none */ +#define CMD_STOP_OLS_ASSY 0x5a /* none */ +#define CMD_OLS_SMTTEST_STOP 0x5b /* none */ +#define CMD_READ_VIN_SCALED 0x5c /* out: u16 */ +#define CMD_READ_BAT_MIN_W 0x5d /* out: u16 */ +#define CMD_READ_BAR_MAX_W 0x5e /* out: u16 */ +#define CMD_RESET_BAT_MINMAX_W 0x5f /* none */ +#define CMD_READ_LOCATION 0x60 /* in: u16 addr, out: u8 data */ +#define CMD_WRITE_LOCATION 0x61 /* in: u16 addr, u8 data */ +#define CMD_KEYBOARD_CMD 0x62 /* in: u8, out: ? */ +#define CMD_TOUCHPAD_CMD 0x63 /* in: u8, out: ? */ +#define CMD_GET_FW_HASH 0x64 /* out: u8 * 16 */ +#define CMD_SUSPEND_HINT 0x65 /* in: u8 */ +#define CMD_ENABLE_WAKE_TIMER 0x66 /* in: u8 */ +#define CMD_SET_WAKE_TIMER 0x67 /* in: 32 */ +#define CMD_ENABLE_WAKE_AUTORESET 0x68 /* in: u8 */ +#define CMD_OLS_SET_LIMITS 0x69 /* in: u16, u16 */ +#define CMD_OLS_GET_LIMITS 0x6a /* out: u16, u16 */ +#define CMD_OLS_SET_CEILING 0x6b /* in: u16 */ +#define CMD_OLS_GET_CEILING 0x6c /* out: u16 */ + +/* + * Accepted EC commands, and how many bytes they return. There are plenty + * of EC commands that are no longer implemented, or are implemented only on + * certain older boards. + */ +static const struct ec_cmd_t olpc_xo175_ec_cmds[] = { + { CMD_GET_API_VERSION, 1 }, + { CMD_READ_VOLTAGE, 2 }, + { CMD_READ_CURRENT, 2 }, + { CMD_READ_ACR, 2 }, + { CMD_READ_BATT_TEMPERATURE, 2 }, + { CMD_READ_BATTERY_STATUS, 1 }, + { CMD_READ_SOC, 1 }, + { CMD_READ_GAUGE_ID, 8 }, + { CMD_READ_GAUGE_DATA, 1 }, + { CMD_READ_BOARD_ID, 2 }, + { CMD_READ_BATT_ERR_CODE, 1 }, + { CMD_SET_DCON_POWER, 0 }, + { CMD_RESET_EC, 0 }, + { CMD_READ_BATTERY_TYPE, 1 }, + { CMD_ENABLE_RUNIN_DISCHARGE, 0 }, + { CMD_DISABLE_RUNIN_DISCHARGE, 0 }, + { CMD_READ_MPPT_ACTIVE, 1 }, + { CMD_READ_MPPT_LIMIT, 1 }, + { CMD_SET_MPPT_LIMIT, 0 }, + { CMD_DISABLE_MPPT, 0 }, + { CMD_ENABLE_MPPT, 0 }, + { CMD_READ_VIN, 2 }, + { CMD_GET_FW_VERSION, 16 }, + { CMD_POWER_CYCLE, 0 }, + { CMD_POWER_OFF, 0 }, + { CMD_RESET_EC_SOFT, 0 }, + { CMD_ECHO, 5 }, + { CMD_GET_FW_DATE, 16 }, + { CMD_GET_FW_USER, 16 }, + { CMD_TURN_OFF_POWER, 0 }, + { CMD_READ_OLS, 2 }, + { CMD_OLS_SMT_LEDON, 0 }, + { CMD_OLS_SMT_LEDOFF, 0 }, + { CMD_START_OLS_ASSY, 0 }, + { CMD_STOP_OLS_ASSY, 0 }, + { CMD_OLS_SMTTEST_STOP, 0 }, + { CMD_READ_VIN_SCALED, 2 }, + { CMD_READ_BAT_MIN_W, 2 }, + { CMD_READ_BAR_MAX_W, 2 }, + { CMD_RESET_BAT_MINMAX_W, 0 }, + { CMD_READ_LOCATION, 1 }, + { CMD_WRITE_LOCATION, 0 }, + { CMD_GET_FW_HASH, 16 }, + { CMD_SUSPEND_HINT, 0 }, + { CMD_ENABLE_WAKE_TIMER, 0 }, + { CMD_SET_WAKE_TIMER, 0 }, + { CMD_ENABLE_WAKE_AUTORESET, 0 }, + { CMD_OLS_SET_LIMITS, 0 }, + { CMD_OLS_GET_LIMITS, 4 }, + { CMD_OLS_SET_CEILING, 0 }, + { CMD_OLS_GET_CEILING, 2 }, + { CMD_READ_EXT_SCI_MASK, 2 }, + { CMD_WRITE_EXT_SCI_MASK, 0 }, + + { } +}; + +#define EC_MAX_CMD_DATA_LEN 5 +#define EC_MAX_RESP_LEN 16 + +#define LOG_BUF_SIZE 128 + +#define PM_WAKEUP_TIME 1000 + +#define EC_ALL_EVENTS GENMASK(15, 0) + +enum ec_state_t { + CMD_STATE_IDLE = 0, + CMD_STATE_WAITING_FOR_SWITCH, + CMD_STATE_CMD_IN_TX_FIFO, + CMD_STATE_CMD_SENT, + CMD_STATE_RESP_RECEIVED, + CMD_STATE_ERROR_RECEIVED, +}; + +struct olpc_xo175_ec_cmd { + u8 command; + u8 nr_args; + u8 data_len; + u8 args[EC_MAX_CMD_DATA_LEN]; +}; + +struct olpc_xo175_ec_resp { + u8 channel; + u8 byte; +}; + +struct olpc_xo175_ec { + bool suspended; + + /* SPI related stuff. */ + struct spi_device *spi; + struct spi_transfer xfer; + struct spi_message msg; + union { + struct olpc_xo175_ec_cmd cmd; + struct olpc_xo175_ec_resp resp; + } tx_buf, rx_buf; + + /* GPIO for the CMD signals. */ + struct gpio_desc *gpio_cmd; + + /* Command handling related state. */ + spinlock_t cmd_state_lock; + int cmd_state; + bool cmd_running; + struct completion cmd_done; + struct olpc_xo175_ec_cmd cmd; + u8 resp_data[EC_MAX_RESP_LEN]; + int expected_resp_len; + int resp_len; + + /* Power button. */ + struct input_dev *pwrbtn; + + /* Debug handling. */ + char logbuf[LOG_BUF_SIZE]; + int logbuf_len; +}; + +static struct platform_device *olpc_ec; + +static int olpc_xo175_ec_resp_len(u8 cmd) +{ + const struct ec_cmd_t *p; + + for (p = olpc_xo175_ec_cmds; p->cmd; p++) { + if (p->cmd == cmd) + return p->bytes_returned; + } + + return -EINVAL; +} + +static void olpc_xo175_ec_flush_logbuf(struct olpc_xo175_ec *priv) +{ + dev_dbg(&priv->spi->dev, "got debug string [%*pE]\n", + priv->logbuf_len, priv->logbuf); + priv->logbuf_len = 0; +} + +static void olpc_xo175_ec_complete(void *arg); + +static void olpc_xo175_ec_send_command(struct olpc_xo175_ec *priv, void *cmd, + size_t cmdlen) +{ + int ret; + + memcpy(&priv->tx_buf, cmd, cmdlen); + priv->xfer.len = cmdlen; + + spi_message_init_with_transfers(&priv->msg, &priv->xfer, 1); + + priv->msg.complete = olpc_xo175_ec_complete; + priv->msg.context = priv; + + ret = spi_async(priv->spi, &priv->msg); + if (ret) + dev_err(&priv->spi->dev, "spi_async() failed %d\n", ret); +} + +static void olpc_xo175_ec_read_packet(struct olpc_xo175_ec *priv) +{ + u8 nonce[] = {0xA5, 0x5A}; + + olpc_xo175_ec_send_command(priv, nonce, sizeof(nonce)); +} + +static void olpc_xo175_ec_complete(void *arg) +{ + struct olpc_xo175_ec *priv = arg; + struct device *dev = &priv->spi->dev; + struct power_supply *psy; + unsigned long flags; + u8 channel; + u8 byte; + int ret; + + ret = priv->msg.status; + if (ret) { + dev_err(dev, "SPI transfer failed: %d\n", ret); + + spin_lock_irqsave(&priv->cmd_state_lock, flags); + if (priv->cmd_running) { + priv->resp_len = 0; + priv->cmd_state = CMD_STATE_ERROR_RECEIVED; + complete(&priv->cmd_done); + } + spin_unlock_irqrestore(&priv->cmd_state_lock, flags); + + if (ret != -EINTR) + olpc_xo175_ec_read_packet(priv); + + return; + } + + channel = priv->rx_buf.resp.channel; + byte = priv->rx_buf.resp.byte; + + switch (channel) { + case CHAN_NONE: + spin_lock_irqsave(&priv->cmd_state_lock, flags); + + if (!priv->cmd_running) { + /* We can safely ignore these */ + dev_err(dev, "spurious FIFO read packet\n"); + spin_unlock_irqrestore(&priv->cmd_state_lock, flags); + return; + } + + priv->cmd_state = CMD_STATE_CMD_SENT; + if (!priv->expected_resp_len) + complete(&priv->cmd_done); + olpc_xo175_ec_read_packet(priv); + + spin_unlock_irqrestore(&priv->cmd_state_lock, flags); + return; + + case CHAN_SWITCH: + spin_lock_irqsave(&priv->cmd_state_lock, flags); + + if (!priv->cmd_running) { + /* Just go with the flow */ + dev_err(dev, "spurious SWITCH packet\n"); + memset(&priv->cmd, 0, sizeof(priv->cmd)); + priv->cmd.command = CMD_ECHO; + } + + priv->cmd_state = CMD_STATE_CMD_IN_TX_FIFO; + + /* Throw command into TxFIFO */ + gpiod_set_value_cansleep(priv->gpio_cmd, 0); + olpc_xo175_ec_send_command(priv, &priv->cmd, sizeof(priv->cmd)); + + spin_unlock_irqrestore(&priv->cmd_state_lock, flags); + return; + + case CHAN_CMD_RESP: + spin_lock_irqsave(&priv->cmd_state_lock, flags); + + if (!priv->cmd_running) { + dev_err(dev, "spurious response packet\n"); + } else if (priv->resp_len >= priv->expected_resp_len) { + dev_err(dev, "too many response packets\n"); + } else { + priv->resp_data[priv->resp_len++] = byte; + if (priv->resp_len == priv->expected_resp_len) { + priv->cmd_state = CMD_STATE_RESP_RECEIVED; + complete(&priv->cmd_done); + } + } + + spin_unlock_irqrestore(&priv->cmd_state_lock, flags); + break; + + case CHAN_CMD_ERROR: + spin_lock_irqsave(&priv->cmd_state_lock, flags); + + if (!priv->cmd_running) { + dev_err(dev, "spurious cmd error packet\n"); + } else { + priv->resp_data[0] = byte; + priv->resp_len = 1; + priv->cmd_state = CMD_STATE_ERROR_RECEIVED; + complete(&priv->cmd_done); + } + spin_unlock_irqrestore(&priv->cmd_state_lock, flags); + break; + + case CHAN_KEYBOARD: + dev_warn(dev, "keyboard is not supported\n"); + break; + + case CHAN_TOUCHPAD: + dev_warn(dev, "touchpad is not supported\n"); + break; + + case CHAN_EVENT: + dev_dbg(dev, "got event %.2x\n", byte); + switch (byte) { + case EVENT_AC_CHANGE: + psy = power_supply_get_by_name("olpc-ac"); + if (psy) { + power_supply_changed(psy); + power_supply_put(psy); + } + break; + case EVENT_BATTERY_STATUS: + case EVENT_BATTERY_CRITICAL: + case EVENT_BATTERY_SOC_CHANGE: + case EVENT_BATTERY_ERROR: + psy = power_supply_get_by_name("olpc-battery"); + if (psy) { + power_supply_changed(psy); + power_supply_put(psy); + } + break; + case EVENT_POWER_PRESSED: + input_report_key(priv->pwrbtn, KEY_POWER, 1); + input_sync(priv->pwrbtn); + input_report_key(priv->pwrbtn, KEY_POWER, 0); + input_sync(priv->pwrbtn); + /* fall through */ + case EVENT_POWER_PRESS_WAKE: + case EVENT_TIMED_HOST_WAKE: + pm_wakeup_event(priv->pwrbtn->dev.parent, + PM_WAKEUP_TIME); + break; + default: + dev_dbg(dev, "ignored unknown event %.2x\n", byte); + break; + } + break; + + case CHAN_DEBUG: + if (byte == '\n') { + olpc_xo175_ec_flush_logbuf(priv); + } else if (isprint(byte)) { + priv->logbuf[priv->logbuf_len++] = byte; + if (priv->logbuf_len == LOG_BUF_SIZE) + olpc_xo175_ec_flush_logbuf(priv); + } + break; + + default: + dev_warn(dev, "unknown channel: %d, %.2x\n", channel, byte); + break; + } + + /* Most non-command packets get the TxFIFO refilled and an ACK. */ + olpc_xo175_ec_read_packet(priv); +} + +/* + * This function is protected with a mutex. We can safely assume that + * there will be only one instance of this function running at a time. + * One of the ways in which we enforce this is by waiting until we get + * all response bytes back from the EC, rather than just the number that + * the caller requests (otherwise, we might start a new command while an + * old command's response bytes are still incoming). + */ +static int olpc_xo175_ec_cmd(u8 cmd, u8 *inbuf, size_t inlen, u8 *resp, + size_t resp_len, void *ec_cb_arg) +{ + struct olpc_xo175_ec *priv = ec_cb_arg; + struct device *dev = &priv->spi->dev; + unsigned long flags; + size_t nr_bytes; + int ret = 0; + + dev_dbg(dev, "CMD %x, %zd bytes expected\n", cmd, resp_len); + + if (inlen > 5) { + dev_err(dev, "command len %zd too big!\n", resp_len); + return -EOVERFLOW; + } + + /* Suspending in the middle of an EC command hoses things badly! */ + if (WARN_ON(priv->suspended)) + return -EBUSY; + + /* Ensure a valid command and return bytes */ + ret = olpc_xo175_ec_resp_len(cmd); + if (ret < 0) { + dev_err_ratelimited(dev, "unknown command 0x%x\n", cmd); + + /* + * Assume the best in our callers, and allow unknown commands + * through. I'm not the charitable type, but it was beaten + * into me. Just maintain a minimum standard of sanity. + */ + if (resp_len > sizeof(priv->resp_data)) { + dev_err(dev, "response too big: %zd!\n", resp_len); + return -EOVERFLOW; + } + nr_bytes = resp_len; + } else { + nr_bytes = (size_t)ret; + } + resp_len = min(resp_len, nr_bytes); + + spin_lock_irqsave(&priv->cmd_state_lock, flags); + + /* Initialize the state machine */ + init_completion(&priv->cmd_done); + priv->cmd_running = true; + priv->cmd_state = CMD_STATE_WAITING_FOR_SWITCH; + memset(&priv->cmd, 0, sizeof(priv->cmd)); + priv->cmd.command = cmd; + priv->cmd.nr_args = inlen; + priv->cmd.data_len = 0; + memcpy(priv->cmd.args, inbuf, inlen); + priv->expected_resp_len = nr_bytes; + priv->resp_len = 0; + + /* Tickle the cmd gpio to get things started */ + gpiod_set_value_cansleep(priv->gpio_cmd, 1); + + spin_unlock_irqrestore(&priv->cmd_state_lock, flags); + + /* The irq handler should do the rest */ + if (!wait_for_completion_timeout(&priv->cmd_done, + msecs_to_jiffies(4000))) { + dev_err(dev, "EC cmd error: timeout in STATE %d\n", + priv->cmd_state); + gpiod_set_value_cansleep(priv->gpio_cmd, 0); + spi_slave_abort(priv->spi); + olpc_xo175_ec_read_packet(priv); + return -ETIMEDOUT; + } + + spin_lock_irqsave(&priv->cmd_state_lock, flags); + + /* Deal with the results. */ + if (priv->cmd_state == CMD_STATE_ERROR_RECEIVED) { + /* EC-provided error is in the single response byte */ + dev_err(dev, "command 0x%x returned error 0x%x\n", + cmd, priv->resp_data[0]); + ret = -EREMOTEIO; + } else if (priv->resp_len != nr_bytes) { + dev_err(dev, "command 0x%x returned %d bytes, expected %zd bytes\n", + cmd, priv->resp_len, nr_bytes); + ret = -EREMOTEIO; + } else { + /* + * We may have 8 bytes in priv->resp, but we only care about + * what we've been asked for. If the caller asked for only 2 + * bytes, give them that. We've guaranteed that + * resp_len <= priv->resp_len and priv->resp_len == nr_bytes. + */ + memcpy(resp, priv->resp_data, resp_len); + } + + /* This should already be low, but just in case. */ + gpiod_set_value_cansleep(priv->gpio_cmd, 0); + priv->cmd_running = false; + + spin_unlock_irqrestore(&priv->cmd_state_lock, flags); + + return ret; +} + +static int olpc_xo175_ec_set_event_mask(unsigned int mask) +{ + u8 args[2]; + + args[0] = mask >> 0; + args[1] = mask >> 8; + return olpc_ec_cmd(CMD_WRITE_EXT_SCI_MASK, args, 2, NULL, 0); +} + +static void olpc_xo175_ec_power_off(void) +{ + while (1) { + olpc_ec_cmd(CMD_POWER_OFF, NULL, 0, NULL, 0); + mdelay(1000); + } +} + +static int __maybe_unused olpc_xo175_ec_suspend(struct device *dev) +{ + struct olpc_xo175_ec *priv = dev_get_drvdata(dev); + static struct { + u8 suspend; + u32 suspend_count; + } __packed hintargs; + static unsigned int suspend_count; + + /* + * SOC_SLEEP is not wired to the EC on B3 and earlier boards. + * This command lets the EC know instead. The suspend count doesn't seem + * to be used anywhere but in the EC debug output. + */ + hintargs.suspend = 1; + hintargs.suspend_count = suspend_count++; + olpc_ec_cmd(CMD_SUSPEND_HINT, (void *)&hintargs, sizeof(hintargs), + NULL, 0); + + /* + * After we've sent the suspend hint, don't allow further EC commands + * to be run until we've resumed. Userspace tasks should be frozen, + * but kernel threads and interrupts could still schedule EC commands. + */ + priv->suspended = true; + + return 0; +} + +static int __maybe_unused olpc_xo175_ec_resume_noirq(struct device *dev) +{ + struct olpc_xo175_ec *priv = dev_get_drvdata(dev); + + priv->suspended = false; + + return 0; +} + +static int __maybe_unused olpc_xo175_ec_resume(struct device *dev) +{ + u8 x = 0; + + /* + * The resume hint is only needed if no other commands are + * being sent during resume. all it does is tell the EC + * the SoC is definitely awake. + */ + olpc_ec_cmd(CMD_SUSPEND_HINT, &x, 1, NULL, 0); + + /* Enable all EC events while we're awake */ + olpc_xo175_ec_set_event_mask(EC_ALL_EVENTS); + + return 0; +} + +static struct olpc_ec_driver olpc_xo175_ec_driver = { + .ec_cmd = olpc_xo175_ec_cmd, +}; + +static int olpc_xo175_ec_remove(struct spi_device *spi) +{ + if (pm_power_off == olpc_xo175_ec_power_off) + pm_power_off = NULL; + + spi_slave_abort(spi); + + platform_device_unregister(olpc_ec); + olpc_ec = NULL; + + return 0; +} + +static int olpc_xo175_ec_probe(struct spi_device *spi) +{ + struct olpc_xo175_ec *priv; + int ret; + + if (olpc_ec) { + dev_err(&spi->dev, "OLPC EC already registered.\n"); + return -EBUSY; + } + + priv = devm_kzalloc(&spi->dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->gpio_cmd = devm_gpiod_get(&spi->dev, "cmd", GPIOD_OUT_LOW); + if (IS_ERR(priv->gpio_cmd)) { + dev_err(&spi->dev, "failed to get cmd gpio: %ld\n", + PTR_ERR(priv->gpio_cmd)); + return PTR_ERR(priv->gpio_cmd); + } + + priv->spi = spi; + + spin_lock_init(&priv->cmd_state_lock); + priv->cmd_state = CMD_STATE_IDLE; + init_completion(&priv->cmd_done); + + priv->logbuf_len = 0; + + /* Set up power button input device */ + priv->pwrbtn = devm_input_allocate_device(&spi->dev); + if (!priv->pwrbtn) + return -ENOMEM; + priv->pwrbtn->name = "Power Button"; + priv->pwrbtn->dev.parent = &spi->dev; + input_set_capability(priv->pwrbtn, EV_KEY, KEY_POWER); + ret = input_register_device(priv->pwrbtn); + if (ret) { + dev_err(&spi->dev, "error registering input device: %d\n", ret); + return ret; + } + + spi_set_drvdata(spi, priv); + + priv->xfer.rx_buf = &priv->rx_buf; + priv->xfer.tx_buf = &priv->tx_buf; + + olpc_xo175_ec_read_packet(priv); + + olpc_ec_driver_register(&olpc_xo175_ec_driver, priv); + olpc_ec = platform_device_register_resndata(&spi->dev, "olpc-ec", -1, + NULL, 0, NULL, 0); + + /* Enable all EC events while we're awake */ + olpc_xo175_ec_set_event_mask(EC_ALL_EVENTS); + + if (pm_power_off == NULL) + pm_power_off = olpc_xo175_ec_power_off; + + dev_info(&spi->dev, "OLPC XO-1.75 Embedded Controller driver\n"); + + return 0; +} + +static const struct dev_pm_ops olpc_xo175_ec_pm_ops = { + SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(NULL, olpc_xo175_ec_resume_noirq) + SET_RUNTIME_PM_OPS(olpc_xo175_ec_suspend, olpc_xo175_ec_resume, NULL) +}; + +static const struct of_device_id olpc_xo175_ec_of_match[] = { + { .compatible = "olpc,xo1.75-ec" }, + { } +}; +MODULE_DEVICE_TABLE(of, olpc_xo175_ec_of_match); + +static struct spi_driver olpc_xo175_ec_spi_driver = { + .driver = { + .name = "olpc-xo175-ec", + .of_match_table = olpc_xo175_ec_of_match, + .pm = &olpc_xo175_ec_pm_ops, + }, + .probe = olpc_xo175_ec_probe, + .remove = olpc_xo175_ec_remove, +}; +module_spi_driver(olpc_xo175_ec_spi_driver); + +MODULE_DESCRIPTION("OLPC XO-1.75 Embedded Controller driver"); +MODULE_AUTHOR("Lennert Buytenhek "); /* Functionality */ +MODULE_AUTHOR("Lubomir Rintel "); /* Bugs */ +MODULE_LICENSE("GPL"); diff --git a/include/linux/olpc-ec.h b/include/linux/olpc-ec.h index f7b6a7eda232..c4602364e909 100644 --- a/include/linux/olpc-ec.h +++ b/include/linux/olpc-ec.h @@ -41,7 +41,7 @@ struct olpc_ec_driver { bool wakeup_available; }; -#ifdef CONFIG_OLPC +#ifdef CONFIG_OLPC_EC extern void olpc_ec_driver_register(struct olpc_ec_driver *drv, void *arg); @@ -69,6 +69,6 @@ static inline bool olpc_ec_wakeup_available(void) return false; } -#endif /* CONFIG_OLPC */ +#endif /* CONFIG_OLPC_EC */ #endif /* _LINUX_OLPC_EC_H */ -- cgit v1.2.3-55-g7522 From f7a9945184100b531f0de3b12c617a349236dd8a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 May 2019 12:42:58 -0400 Subject: no need to protect against put_user_ns(NULL) it's a no-op Signed-off-by: Al Viro --- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 3 +-- fs/sysfs/mount.c | 3 +-- kernel/cgroup/cgroup.c | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 333c177a2471..68bd609026e6 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -2108,8 +2108,7 @@ static int rdt_init_fs_context(struct fs_context *fc) ctx->kfc.magic = RDTGROUP_SUPER_MAGIC; fc->fs_private = &ctx->kfc; fc->ops = &rdt_fs_context_ops; - if (fc->user_ns) - put_user_ns(fc->user_ns); + put_user_ns(fc->user_ns); fc->user_ns = get_user_ns(&init_user_ns); fc->global = true; return 0; diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 1b56686ab178..db81cfbab9d6 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -72,8 +72,7 @@ static int sysfs_init_fs_context(struct fs_context *fc) fc->fs_private = kfc; fc->ops = &sysfs_fs_context_ops; if (netns) { - if (fc->user_ns) - put_user_ns(fc->user_ns); + put_user_ns(fc->user_ns); fc->user_ns = get_user_ns(netns->user_ns); } fc->global = true; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 217cec4e22c6..bbcdd3457eb0 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2184,8 +2184,7 @@ static int cgroup_init_fs_context(struct fs_context *fc) fc->ops = &cgroup_fs_context_ops; else fc->ops = &cgroup1_fs_context_ops; - if (fc->user_ns) - put_user_ns(fc->user_ns); + put_user_ns(fc->user_ns); fc->user_ns = get_user_ns(ctx->ns->user_ns); fc->global = true; return 0; -- cgit v1.2.3-55-g7522 From 0c9f23797925069f9ce267c97e488e293f647c69 Mon Sep 17 00:00:00 2001 From: Steven Rostedt (VMware) Date: Mon, 20 May 2019 09:38:11 -0400 Subject: x86/ftrace: Make enable parameter bool where applicable The code modification functions have an "enable" parameter that is an "int" but used as a boolean. Switch its type to "bool" to remove the ambiguity that "int" causes. Link: http://lkml.kernel.org/r/e1429923d9eda92a3cf5ee9e33c7eacce539781d.1558115654.git.naveen.n.rao@linux.vnet.ibm.com Reported-by: "Naveen N. Rao" Signed-off-by: Steven Rostedt (VMware) --- arch/x86/kernel/ftrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 0927bb158ffc..ba37bcb7f92b 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -370,7 +370,7 @@ static int add_brk_on_nop(struct dyn_ftrace *rec) return add_break(rec->ip, old); } -static int add_breakpoints(struct dyn_ftrace *rec, int enable) +static int add_breakpoints(struct dyn_ftrace *rec, bool enable) { unsigned long ftrace_addr; int ret; @@ -478,7 +478,7 @@ static int add_update_nop(struct dyn_ftrace *rec) return add_update_code(ip, new); } -static int add_update(struct dyn_ftrace *rec, int enable) +static int add_update(struct dyn_ftrace *rec, bool enable) { unsigned long ftrace_addr; int ret; @@ -524,7 +524,7 @@ static int finish_update_nop(struct dyn_ftrace *rec) return ftrace_write(ip, new, 1); } -static int finish_update(struct dyn_ftrace *rec, int enable) +static int finish_update(struct dyn_ftrace *rec, bool enable) { unsigned long ftrace_addr; int ret; -- cgit v1.2.3-55-g7522 From 2d8d8fac3b4eab035dcd0068e1f5a746a697fbb3 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 15 May 2019 14:38:06 +0900 Subject: x86/uaccess: Allow access_ok() in irq context if pagefault_disabled WARN_ON_IN_IRQ() assumes that the access_ok() and following user memory access can sleep. But this assumption is not always correct; when the pagefault is disabled, following memory access will just returns -EFAULT and never sleep. Add pagefault_disabled() check in WARN_ON_ONCE() so that it can ignore the case we call it with disabling pagefault. For this purpose, this modified pagefault_disabled() as an inline function. Link: http://lkml.kernel.org/r/155789868664.26965.7932665824135793317.stgit@devnote2 Acked-by: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- arch/x86/include/asm/uaccess.h | 4 +++- include/linux/uaccess.h | 5 ++++- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index c82abd6e4ca3..9c4435307ff8 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -66,7 +66,9 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un }) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP -# define WARN_ON_IN_IRQ() WARN_ON_ONCE(!in_task()) +static inline bool pagefault_disabled(void); +# define WARN_ON_IN_IRQ() \ + WARN_ON_ONCE(!in_task() && !pagefault_disabled()) #else # define WARN_ON_IN_IRQ() #endif diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 2b70130af585..5a43ef7db492 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -203,7 +203,10 @@ static inline void pagefault_enable(void) /* * Is the pagefault handler disabled? If so, user access methods will not sleep. */ -#define pagefault_disabled() (current->pagefault_disabled != 0) +static inline bool pagefault_disabled(void) +{ + return current->pagefault_disabled != 0; +} /* * The pagefault handler is in general disabled by pagefault_disable() or -- cgit v1.2.3-55-g7522 From b8a84365bbff0f860c5dc5795405430d92d68966 Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Wed, 29 May 2019 10:34:02 +0200 Subject: Platform: OLPC: Make olpc_dt_compatible_match() static __init Addresses a kbuild warning: >> WARNING: vmlinux.o(.text+0x3b764): Section mismatch in reference from the function olpc_dt_compatible_match() to the function .init.text:olpc_dt_getproperty() Signed-off-by: Lubomir Rintel Reported-by: kbuild test robot Fixes: a7a9bacb9a32 (x86/platform/olpc: Use a correct version when making up a battery node) Signed-off-by: Andy Shevchenko --- arch/x86/platform/olpc/olpc_dt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c index 0296c5b55e6f..114c52986568 100644 --- a/arch/x86/platform/olpc/olpc_dt.c +++ b/arch/x86/platform/olpc/olpc_dt.c @@ -220,7 +220,7 @@ static u32 __init olpc_dt_get_board_revision(void) return be32_to_cpu(rev); } -int olpc_dt_compatible_match(phandle node, const char *compat) +static int __init olpc_dt_compatible_match(phandle node, const char *compat) { char buf[64], *p; int plen, len; -- cgit v1.2.3-55-g7522 From 151f4e2bdc7a04020ae5c533896fb91a16e1f501 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 13 Jun 2019 07:10:36 -0300 Subject: docs: power: convert docs to ReST and rename to *.rst Convert the PM documents to ReST, in order to allow them to build with Sphinx. The conversion is actually: - add blank lines and indentation in order to identify paragraphs; - fix tables markups; - add some lists markups; - mark literal blocks; - adjust title markups. At its new index.rst, let's add a :orphan: while this is not linked to the main index.rst file, in order to avoid build warnings. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Bjorn Helgaas Acked-by: Mark Brown Acked-by: Srivatsa S. Bhat (VMware) --- Documentation/ABI/testing/sysfs-class-powercap | 2 +- Documentation/admin-guide/kernel-parameters.txt | 6 +- Documentation/cpu-freq/core.txt | 2 +- Documentation/driver-api/pm/devices.rst | 6 +- Documentation/driver-api/usb/power-management.rst | 2 +- Documentation/power/apm-acpi.rst | 36 + Documentation/power/apm-acpi.txt | 32 - Documentation/power/basic-pm-debugging.rst | 269 +++++ Documentation/power/basic-pm-debugging.txt | 254 ----- Documentation/power/charger-manager.rst | 205 ++++ Documentation/power/charger-manager.txt | 200 ---- Documentation/power/drivers-testing.rst | 51 + Documentation/power/drivers-testing.txt | 46 - Documentation/power/energy-model.rst | 147 +++ Documentation/power/energy-model.txt | 144 --- Documentation/power/freezing-of-tasks.rst | 244 +++++ Documentation/power/freezing-of-tasks.txt | 231 ---- Documentation/power/index.rst | 46 + Documentation/power/interface.rst | 79 ++ Documentation/power/interface.txt | 77 -- Documentation/power/opp.rst | 379 +++++++ Documentation/power/opp.txt | 342 ------ Documentation/power/pci.rst | 1135 ++++++++++++++++++++ Documentation/power/pci.txt | 1094 ------------------- Documentation/power/pm_qos_interface.rst | 225 ++++ Documentation/power/pm_qos_interface.txt | 212 ---- Documentation/power/power_supply_class.rst | 282 +++++ Documentation/power/power_supply_class.txt | 231 ---- Documentation/power/powercap/powercap.rst | 257 +++++ Documentation/power/powercap/powercap.txt | 236 ---- Documentation/power/regulator/consumer.rst | 229 ++++ Documentation/power/regulator/consumer.txt | 218 ---- Documentation/power/regulator/design.rst | 38 + Documentation/power/regulator/design.txt | 33 - Documentation/power/regulator/machine.rst | 97 ++ Documentation/power/regulator/machine.txt | 96 -- Documentation/power/regulator/overview.rst | 178 +++ Documentation/power/regulator/overview.txt | 171 --- Documentation/power/regulator/regulator.rst | 32 + Documentation/power/regulator/regulator.txt | 30 - Documentation/power/runtime_pm.rst | 940 ++++++++++++++++ Documentation/power/runtime_pm.txt | 928 ---------------- Documentation/power/s2ram.rst | 87 ++ Documentation/power/s2ram.txt | 85 -- Documentation/power/suspend-and-cpuhotplug.rst | 286 +++++ Documentation/power/suspend-and-cpuhotplug.txt | 274 ----- Documentation/power/suspend-and-interrupts.rst | 137 +++ Documentation/power/suspend-and-interrupts.txt | 135 --- Documentation/power/swsusp-and-swap-files.rst | 63 ++ Documentation/power/swsusp-and-swap-files.txt | 60 -- Documentation/power/swsusp-dmcrypt.rst | 140 +++ Documentation/power/swsusp-dmcrypt.txt | 138 --- Documentation/power/swsusp.rst | 501 +++++++++ Documentation/power/swsusp.txt | 446 -------- Documentation/power/tricks.rst | 29 + Documentation/power/tricks.txt | 27 - Documentation/power/userland-swsusp.rst | 191 ++++ Documentation/power/userland-swsusp.txt | 170 --- Documentation/power/video.rst | 213 ++++ Documentation/power/video.txt | 185 ---- Documentation/process/submitting-drivers.rst | 2 +- Documentation/scheduler/sched-energy.txt | 6 +- Documentation/trace/coresight-cpu-debug.txt | 2 +- .../zh_CN/process/submitting-drivers.rst | 2 +- MAINTAINERS | 4 +- arch/x86/Kconfig | 2 +- drivers/gpu/drm/i915/i915_drv.h | 2 +- drivers/opp/Kconfig | 2 +- drivers/power/supply/power_supply_core.c | 2 +- include/linux/interrupt.h | 2 +- include/linux/pci.h | 2 +- include/linux/pm.h | 2 +- kernel/power/Kconfig | 6 +- net/wireless/Kconfig | 2 +- 74 files changed, 6544 insertions(+), 6123 deletions(-) create mode 100644 Documentation/power/apm-acpi.rst delete mode 100644 Documentation/power/apm-acpi.txt create mode 100644 Documentation/power/basic-pm-debugging.rst delete mode 100644 Documentation/power/basic-pm-debugging.txt create mode 100644 Documentation/power/charger-manager.rst delete mode 100644 Documentation/power/charger-manager.txt create mode 100644 Documentation/power/drivers-testing.rst delete mode 100644 Documentation/power/drivers-testing.txt create mode 100644 Documentation/power/energy-model.rst delete mode 100644 Documentation/power/energy-model.txt create mode 100644 Documentation/power/freezing-of-tasks.rst delete mode 100644 Documentation/power/freezing-of-tasks.txt create mode 100644 Documentation/power/index.rst create mode 100644 Documentation/power/interface.rst delete mode 100644 Documentation/power/interface.txt create mode 100644 Documentation/power/opp.rst delete mode 100644 Documentation/power/opp.txt create mode 100644 Documentation/power/pci.rst delete mode 100644 Documentation/power/pci.txt create mode 100644 Documentation/power/pm_qos_interface.rst delete mode 100644 Documentation/power/pm_qos_interface.txt create mode 100644 Documentation/power/power_supply_class.rst delete mode 100644 Documentation/power/power_supply_class.txt create mode 100644 Documentation/power/powercap/powercap.rst delete mode 100644 Documentation/power/powercap/powercap.txt create mode 100644 Documentation/power/regulator/consumer.rst delete mode 100644 Documentation/power/regulator/consumer.txt create mode 100644 Documentation/power/regulator/design.rst delete mode 100644 Documentation/power/regulator/design.txt create mode 100644 Documentation/power/regulator/machine.rst delete mode 100644 Documentation/power/regulator/machine.txt create mode 100644 Documentation/power/regulator/overview.rst delete mode 100644 Documentation/power/regulator/overview.txt create mode 100644 Documentation/power/regulator/regulator.rst delete mode 100644 Documentation/power/regulator/regulator.txt create mode 100644 Documentation/power/runtime_pm.rst delete mode 100644 Documentation/power/runtime_pm.txt create mode 100644 Documentation/power/s2ram.rst delete mode 100644 Documentation/power/s2ram.txt create mode 100644 Documentation/power/suspend-and-cpuhotplug.rst delete mode 100644 Documentation/power/suspend-and-cpuhotplug.txt create mode 100644 Documentation/power/suspend-and-interrupts.rst delete mode 100644 Documentation/power/suspend-and-interrupts.txt create mode 100644 Documentation/power/swsusp-and-swap-files.rst delete mode 100644 Documentation/power/swsusp-and-swap-files.txt create mode 100644 Documentation/power/swsusp-dmcrypt.rst delete mode 100644 Documentation/power/swsusp-dmcrypt.txt create mode 100644 Documentation/power/swsusp.rst delete mode 100644 Documentation/power/swsusp.txt create mode 100644 Documentation/power/tricks.rst delete mode 100644 Documentation/power/tricks.txt create mode 100644 Documentation/power/userland-swsusp.rst delete mode 100644 Documentation/power/userland-swsusp.txt create mode 100644 Documentation/power/video.rst delete mode 100644 Documentation/power/video.txt (limited to 'arch/x86') diff --git a/Documentation/ABI/testing/sysfs-class-powercap b/Documentation/ABI/testing/sysfs-class-powercap index db3b3ff70d84..742dfd966592 100644 --- a/Documentation/ABI/testing/sysfs-class-powercap +++ b/Documentation/ABI/testing/sysfs-class-powercap @@ -5,7 +5,7 @@ Contact: linux-pm@vger.kernel.org Description: The powercap/ class sub directory belongs to the power cap subsystem. Refer to - Documentation/power/powercap/powercap.txt for details. + Documentation/power/powercap/powercap.rst for details. What: /sys/class/powercap/ Date: September 2013 diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 138f6664b2e2..7f5ca6e7c4d3 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -13,7 +13,7 @@ For ARM64, ONLY "acpi=off", "acpi=on" or "acpi=force" are available - See also Documentation/power/runtime_pm.txt, pci=noacpi + See also Documentation/power/runtime_pm.rst, pci=noacpi acpi_apic_instance= [ACPI, IOAPIC] Format: @@ -223,7 +223,7 @@ acpi_sleep= [HW,ACPI] Sleep options Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig, old_ordering, nonvs, sci_force_enable, nobl } - See Documentation/power/video.txt for information on + See Documentation/power/video.rst for information on s3_bios and s3_mode. s3_beep is for debugging; it makes the PC's speaker beep as soon as the kernel's real-mode entry point is called. @@ -4108,7 +4108,7 @@ Specify the offset from the beginning of the partition given by "resume=" at which the swap header is located, in units (needed only for swap files). - See Documentation/power/swsusp-and-swap-files.txt + See Documentation/power/swsusp-and-swap-files.rst resumedelay= [HIBERNATION] Delay (in seconds) to pause before attempting to read the resume files diff --git a/Documentation/cpu-freq/core.txt b/Documentation/cpu-freq/core.txt index 073f128af5a7..55193e680250 100644 --- a/Documentation/cpu-freq/core.txt +++ b/Documentation/cpu-freq/core.txt @@ -95,7 +95,7 @@ flags - flags of the cpufreq driver 3. CPUFreq Table Generation with Operating Performance Point (OPP) ================================================================== -For details about OPP, see Documentation/power/opp.txt +For details about OPP, see Documentation/power/opp.rst dev_pm_opp_init_cpufreq_table - This function provides a ready to use conversion routine to translate diff --git a/Documentation/driver-api/pm/devices.rst b/Documentation/driver-api/pm/devices.rst index 30835683616a..f66c7b9126ea 100644 --- a/Documentation/driver-api/pm/devices.rst +++ b/Documentation/driver-api/pm/devices.rst @@ -225,7 +225,7 @@ system-wide transition to a sleep state even though its :c:member:`runtime_auto` flag is clear. For more information about the runtime power management framework, refer to -:file:`Documentation/power/runtime_pm.txt`. +:file:`Documentation/power/runtime_pm.rst`. Calling Drivers to Enter and Leave System Sleep States @@ -728,7 +728,7 @@ it into account in any way. Devices may be defined as IRQ-safe which indicates to the PM core that their runtime PM callbacks may be invoked with disabled interrupts (see -:file:`Documentation/power/runtime_pm.txt` for more information). If an +:file:`Documentation/power/runtime_pm.rst` for more information). If an IRQ-safe device belongs to a PM domain, the runtime PM of the domain will be disallowed, unless the domain itself is defined as IRQ-safe. However, it makes sense to define a PM domain as IRQ-safe only if all the devices in it @@ -795,7 +795,7 @@ so on) and the final state of the device must reflect the "active" runtime PM status in that case. During system-wide resume from a sleep state it's easiest to put devices into -the full-power state, as explained in :file:`Documentation/power/runtime_pm.txt`. +the full-power state, as explained in :file:`Documentation/power/runtime_pm.rst`. [Refer to that document for more information regarding this particular issue as well as for information on the device runtime power management framework in general.] diff --git a/Documentation/driver-api/usb/power-management.rst b/Documentation/driver-api/usb/power-management.rst index 4a74cf6f2797..2525c3622cae 100644 --- a/Documentation/driver-api/usb/power-management.rst +++ b/Documentation/driver-api/usb/power-management.rst @@ -46,7 +46,7 @@ device is turned off while the system as a whole remains running, we call it a "dynamic suspend" (also known as a "runtime suspend" or "selective suspend"). This document concentrates mostly on how dynamic PM is implemented in the USB subsystem, although system PM is -covered to some extent (see ``Documentation/power/*.txt`` for more +covered to some extent (see ``Documentation/power/*.rst`` for more information about system PM). System PM support is present only if the kernel was built with diff --git a/Documentation/power/apm-acpi.rst b/Documentation/power/apm-acpi.rst new file mode 100644 index 000000000000..5b90d947126d --- /dev/null +++ b/Documentation/power/apm-acpi.rst @@ -0,0 +1,36 @@ +============ +APM or ACPI? +============ + +If you have a relatively recent x86 mobile, desktop, or server system, +odds are it supports either Advanced Power Management (APM) or +Advanced Configuration and Power Interface (ACPI). ACPI is the newer +of the two technologies and puts power management in the hands of the +operating system, allowing for more intelligent power management than +is possible with BIOS controlled APM. + +The best way to determine which, if either, your system supports is to +build a kernel with both ACPI and APM enabled (as of 2.3.x ACPI is +enabled by default). If a working ACPI implementation is found, the +ACPI driver will override and disable APM, otherwise the APM driver +will be used. + +No, sorry, you cannot have both ACPI and APM enabled and running at +once. Some people with broken ACPI or broken APM implementations +would like to use both to get a full set of working features, but you +simply cannot mix and match the two. Only one power management +interface can be in control of the machine at once. Think about it.. + +User-space Daemons +------------------ +Both APM and ACPI rely on user-space daemons, apmd and acpid +respectively, to be completely functional. Obtain both of these +daemons from your Linux distribution or from the Internet (see below) +and be sure that they are started sometime in the system boot process. +Go ahead and start both. If ACPI or APM is not available on your +system the associated daemon will exit gracefully. + + ===== ======================================= + apmd http://ftp.debian.org/pool/main/a/apmd/ + acpid http://acpid.sf.net/ + ===== ======================================= diff --git a/Documentation/power/apm-acpi.txt b/Documentation/power/apm-acpi.txt deleted file mode 100644 index 6cc423d3662e..000000000000 --- a/Documentation/power/apm-acpi.txt +++ /dev/null @@ -1,32 +0,0 @@ -APM or ACPI? ------------- -If you have a relatively recent x86 mobile, desktop, or server system, -odds are it supports either Advanced Power Management (APM) or -Advanced Configuration and Power Interface (ACPI). ACPI is the newer -of the two technologies and puts power management in the hands of the -operating system, allowing for more intelligent power management than -is possible with BIOS controlled APM. - -The best way to determine which, if either, your system supports is to -build a kernel with both ACPI and APM enabled (as of 2.3.x ACPI is -enabled by default). If a working ACPI implementation is found, the -ACPI driver will override and disable APM, otherwise the APM driver -will be used. - -No, sorry, you cannot have both ACPI and APM enabled and running at -once. Some people with broken ACPI or broken APM implementations -would like to use both to get a full set of working features, but you -simply cannot mix and match the two. Only one power management -interface can be in control of the machine at once. Think about it.. - -User-space Daemons ------------------- -Both APM and ACPI rely on user-space daemons, apmd and acpid -respectively, to be completely functional. Obtain both of these -daemons from your Linux distribution or from the Internet (see below) -and be sure that they are started sometime in the system boot process. -Go ahead and start both. If ACPI or APM is not available on your -system the associated daemon will exit gracefully. - - apmd: http://ftp.debian.org/pool/main/a/apmd/ - acpid: http://acpid.sf.net/ diff --git a/Documentation/power/basic-pm-debugging.rst b/Documentation/power/basic-pm-debugging.rst new file mode 100644 index 000000000000..69862e759c30 --- /dev/null +++ b/Documentation/power/basic-pm-debugging.rst @@ -0,0 +1,269 @@ +================================= +Debugging hibernation and suspend +================================= + + (C) 2007 Rafael J. Wysocki , GPL + +1. Testing hibernation (aka suspend to disk or STD) +=================================================== + +To check if hibernation works, you can try to hibernate in the "reboot" mode:: + + # echo reboot > /sys/power/disk + # echo disk > /sys/power/state + +and the system should create a hibernation image, reboot, resume and get back to +the command prompt where you have started the transition. If that happens, +hibernation is most likely to work correctly. Still, you need to repeat the +test at least a couple of times in a row for confidence. [This is necessary, +because some problems only show up on a second attempt at suspending and +resuming the system.] Moreover, hibernating in the "reboot" and "shutdown" +modes causes the PM core to skip some platform-related callbacks which on ACPI +systems might be necessary to make hibernation work. Thus, if your machine +fails to hibernate or resume in the "reboot" mode, you should try the +"platform" mode:: + + # echo platform > /sys/power/disk + # echo disk > /sys/power/state + +which is the default and recommended mode of hibernation. + +Unfortunately, the "platform" mode of hibernation does not work on some systems +with broken BIOSes. In such cases the "shutdown" mode of hibernation might +work:: + + # echo shutdown > /sys/power/disk + # echo disk > /sys/power/state + +(it is similar to the "reboot" mode, but it requires you to press the power +button to make the system resume). + +If neither "platform" nor "shutdown" hibernation mode works, you will need to +identify what goes wrong. + +a) Test modes of hibernation +---------------------------- + +To find out why hibernation fails on your system, you can use a special testing +facility available if the kernel is compiled with CONFIG_PM_DEBUG set. Then, +there is the file /sys/power/pm_test that can be used to make the hibernation +core run in a test mode. There are 5 test modes available: + +freezer + - test the freezing of processes + +devices + - test the freezing of processes and suspending of devices + +platform + - test the freezing of processes, suspending of devices and platform + global control methods [1]_ + +processors + - test the freezing of processes, suspending of devices, platform + global control methods [1]_ and the disabling of nonboot CPUs + +core + - test the freezing of processes, suspending of devices, platform global + control methods\ [1]_, the disabling of nonboot CPUs and suspending + of platform/system devices + +.. [1] + + the platform global control methods are only available on ACPI systems + and are only tested if the hibernation mode is set to "platform" + +To use one of them it is necessary to write the corresponding string to +/sys/power/pm_test (eg. "devices" to test the freezing of processes and +suspending devices) and issue the standard hibernation commands. For example, +to use the "devices" test mode along with the "platform" mode of hibernation, +you should do the following:: + + # echo devices > /sys/power/pm_test + # echo platform > /sys/power/disk + # echo disk > /sys/power/state + +Then, the kernel will try to freeze processes, suspend devices, wait a few +seconds (5 by default, but configurable by the suspend.pm_test_delay module +parameter), resume devices and thaw processes. If "platform" is written to +/sys/power/pm_test , then after suspending devices the kernel will additionally +invoke the global control methods (eg. ACPI global control methods) used to +prepare the platform firmware for hibernation. Next, it will wait a +configurable number of seconds and invoke the platform (eg. ACPI) global +methods used to cancel hibernation etc. + +Writing "none" to /sys/power/pm_test causes the kernel to switch to the normal +hibernation/suspend operations. Also, when open for reading, /sys/power/pm_test +contains a space-separated list of all available tests (including "none" that +represents the normal functionality) in which the current test level is +indicated by square brackets. + +Generally, as you can see, each test level is more "invasive" than the previous +one and the "core" level tests the hardware and drivers as deeply as possible +without creating a hibernation image. Obviously, if the "devices" test fails, +the "platform" test will fail as well and so on. Thus, as a rule of thumb, you +should try the test modes starting from "freezer", through "devices", "platform" +and "processors" up to "core" (repeat the test on each level a couple of times +to make sure that any random factors are avoided). + +If the "freezer" test fails, there is a task that cannot be frozen (in that case +it usually is possible to identify the offending task by analysing the output of +dmesg obtained after the failing test). Failure at this level usually means +that there is a problem with the tasks freezer subsystem that should be +reported. + +If the "devices" test fails, most likely there is a driver that cannot suspend +or resume its device (in the latter case the system may hang or become unstable +after the test, so please take that into consideration). To find this driver, +you can carry out a binary search according to the rules: + +- if the test fails, unload a half of the drivers currently loaded and repeat + (that would probably involve rebooting the system, so always note what drivers + have been loaded before the test), +- if the test succeeds, load a half of the drivers you have unloaded most + recently and repeat. + +Once you have found the failing driver (there can be more than just one of +them), you have to unload it every time before hibernation. In that case please +make sure to report the problem with the driver. + +It is also possible that the "devices" test will still fail after you have +unloaded all modules. In that case, you may want to look in your kernel +configuration for the drivers that can be compiled as modules (and test again +with these drivers compiled as modules). You may also try to use some special +kernel command line options such as "noapic", "noacpi" or even "acpi=off". + +If the "platform" test fails, there is a problem with the handling of the +platform (eg. ACPI) firmware on your system. In that case the "platform" mode +of hibernation is not likely to work. You can try the "shutdown" mode, but that +is rather a poor man's workaround. + +If the "processors" test fails, the disabling/enabling of nonboot CPUs does not +work (of course, this only may be an issue on SMP systems) and the problem +should be reported. In that case you can also try to switch the nonboot CPUs +off and on using the /sys/devices/system/cpu/cpu*/online sysfs attributes and +see if that works. + +If the "core" test fails, which means that suspending of the system/platform +devices has failed (these devices are suspended on one CPU with interrupts off), +the problem is most probably hardware-related and serious, so it should be +reported. + +A failure of any of the "platform", "processors" or "core" tests may cause your +system to hang or become unstable, so please beware. Such a failure usually +indicates a serious problem that very well may be related to the hardware, but +please report it anyway. + +b) Testing minimal configuration +-------------------------------- + +If all of the hibernation test modes work, you can boot the system with the +"init=/bin/bash" command line parameter and attempt to hibernate in the +"reboot", "shutdown" and "platform" modes. If that does not work, there +probably is a problem with a driver statically compiled into the kernel and you +can try to compile more drivers as modules, so that they can be tested +individually. Otherwise, there is a problem with a modular driver and you can +find it by loading a half of the modules you normally use and binary searching +in accordance with the algorithm: +- if there are n modules loaded and the attempt to suspend and resume fails, +unload n/2 of the modules and try again (that would probably involve rebooting +the system), +- if there are n modules loaded and the attempt to suspend and resume succeeds, +load n/2 modules more and try again. + +Again, if you find the offending module(s), it(they) must be unloaded every time +before hibernation, and please report the problem with it(them). + +c) Using the "test_resume" hibernation option +--------------------------------------------- + +/sys/power/disk generally tells the kernel what to do after creating a +hibernation image. One of the available options is "test_resume" which +causes the just created image to be used for immediate restoration. Namely, +after doing:: + + # echo test_resume > /sys/power/disk + # echo disk > /sys/power/state + +a hibernation image will be created and a resume from it will be triggered +immediately without involving the platform firmware in any way. + +That test can be used to check if failures to resume from hibernation are +related to bad interactions with the platform firmware. That is, if the above +works every time, but resume from actual hibernation does not work or is +unreliable, the platform firmware may be responsible for the failures. + +On architectures and platforms that support using different kernels to restore +hibernation images (that is, the kernel used to read the image from storage and +load it into memory is different from the one included in the image) or support +kernel address space randomization, it also can be used to check if failures +to resume may be related to the differences between the restore and image +kernels. + +d) Advanced debugging +--------------------- + +In case that hibernation does not work on your system even in the minimal +configuration and compiling more drivers as modules is not practical or some +modules cannot be unloaded, you can use one of the more advanced debugging +techniques to find the problem. First, if there is a serial port in your box, +you can boot the kernel with the 'no_console_suspend' parameter and try to log +kernel messages using the serial console. This may provide you with some +information about the reasons of the suspend (resume) failure. Alternatively, +it may be possible to use a FireWire port for debugging with firescope +(http://v3.sk/~lkundrak/firescope/). On x86 it is also possible to +use the PM_TRACE mechanism documented in Documentation/power/s2ram.rst . + +2. Testing suspend to RAM (STR) +=============================== + +To verify that the STR works, it is generally more convenient to use the s2ram +tool available from http://suspend.sf.net and documented at +http://en.opensuse.org/SDB:Suspend_to_RAM (S2RAM_LINK). + +Namely, after writing "freezer", "devices", "platform", "processors", or "core" +into /sys/power/pm_test (available if the kernel is compiled with +CONFIG_PM_DEBUG set) the suspend code will work in the test mode corresponding +to given string. The STR test modes are defined in the same way as for +hibernation, so please refer to Section 1 for more information about them. In +particular, the "core" test allows you to test everything except for the actual +invocation of the platform firmware in order to put the system into the sleep +state. + +Among other things, the testing with the help of /sys/power/pm_test may allow +you to identify drivers that fail to suspend or resume their devices. They +should be unloaded every time before an STR transition. + +Next, you can follow the instructions at S2RAM_LINK to test the system, but if +it does not work "out of the box", you may need to boot it with +"init=/bin/bash" and test s2ram in the minimal configuration. In that case, +you may be able to search for failing drivers by following the procedure +analogous to the one described in section 1. If you find some failing drivers, +you will have to unload them every time before an STR transition (ie. before +you run s2ram), and please report the problems with them. + +There is a debugfs entry which shows the suspend to RAM statistics. Here is an +example of its output:: + + # mount -t debugfs none /sys/kernel/debug + # cat /sys/kernel/debug/suspend_stats + success: 20 + fail: 5 + failed_freeze: 0 + failed_prepare: 0 + failed_suspend: 5 + failed_suspend_noirq: 0 + failed_resume: 0 + failed_resume_noirq: 0 + failures: + last_failed_dev: alarm + adc + last_failed_errno: -16 + -16 + last_failed_step: suspend + suspend + +Field success means the success number of suspend to RAM, and field fail means +the failure number. Others are the failure number of different steps of suspend +to RAM. suspend_stats just lists the last 2 failed devices, error number and +failed step of suspend. diff --git a/Documentation/power/basic-pm-debugging.txt b/Documentation/power/basic-pm-debugging.txt deleted file mode 100644 index 708f87f78a75..000000000000 --- a/Documentation/power/basic-pm-debugging.txt +++ /dev/null @@ -1,254 +0,0 @@ -Debugging hibernation and suspend - (C) 2007 Rafael J. Wysocki , GPL - -1. Testing hibernation (aka suspend to disk or STD) - -To check if hibernation works, you can try to hibernate in the "reboot" mode: - -# echo reboot > /sys/power/disk -# echo disk > /sys/power/state - -and the system should create a hibernation image, reboot, resume and get back to -the command prompt where you have started the transition. If that happens, -hibernation is most likely to work correctly. Still, you need to repeat the -test at least a couple of times in a row for confidence. [This is necessary, -because some problems only show up on a second attempt at suspending and -resuming the system.] Moreover, hibernating in the "reboot" and "shutdown" -modes causes the PM core to skip some platform-related callbacks which on ACPI -systems might be necessary to make hibernation work. Thus, if your machine fails -to hibernate or resume in the "reboot" mode, you should try the "platform" mode: - -# echo platform > /sys/power/disk -# echo disk > /sys/power/state - -which is the default and recommended mode of hibernation. - -Unfortunately, the "platform" mode of hibernation does not work on some systems -with broken BIOSes. In such cases the "shutdown" mode of hibernation might -work: - -# echo shutdown > /sys/power/disk -# echo disk > /sys/power/state - -(it is similar to the "reboot" mode, but it requires you to press the power -button to make the system resume). - -If neither "platform" nor "shutdown" hibernation mode works, you will need to -identify what goes wrong. - -a) Test modes of hibernation - -To find out why hibernation fails on your system, you can use a special testing -facility available if the kernel is compiled with CONFIG_PM_DEBUG set. Then, -there is the file /sys/power/pm_test that can be used to make the hibernation -core run in a test mode. There are 5 test modes available: - -freezer -- test the freezing of processes - -devices -- test the freezing of processes and suspending of devices - -platform -- test the freezing of processes, suspending of devices and platform - global control methods(*) - -processors -- test the freezing of processes, suspending of devices, platform - global control methods(*) and the disabling of nonboot CPUs - -core -- test the freezing of processes, suspending of devices, platform global - control methods(*), the disabling of nonboot CPUs and suspending of - platform/system devices - -(*) the platform global control methods are only available on ACPI systems - and are only tested if the hibernation mode is set to "platform" - -To use one of them it is necessary to write the corresponding string to -/sys/power/pm_test (eg. "devices" to test the freezing of processes and -suspending devices) and issue the standard hibernation commands. For example, -to use the "devices" test mode along with the "platform" mode of hibernation, -you should do the following: - -# echo devices > /sys/power/pm_test -# echo platform > /sys/power/disk -# echo disk > /sys/power/state - -Then, the kernel will try to freeze processes, suspend devices, wait a few -seconds (5 by default, but configurable by the suspend.pm_test_delay module -parameter), resume devices and thaw processes. If "platform" is written to -/sys/power/pm_test , then after suspending devices the kernel will additionally -invoke the global control methods (eg. ACPI global control methods) used to -prepare the platform firmware for hibernation. Next, it will wait a -configurable number of seconds and invoke the platform (eg. ACPI) global -methods used to cancel hibernation etc. - -Writing "none" to /sys/power/pm_test causes the kernel to switch to the normal -hibernation/suspend operations. Also, when open for reading, /sys/power/pm_test -contains a space-separated list of all available tests (including "none" that -represents the normal functionality) in which the current test level is -indicated by square brackets. - -Generally, as you can see, each test level is more "invasive" than the previous -one and the "core" level tests the hardware and drivers as deeply as possible -without creating a hibernation image. Obviously, if the "devices" test fails, -the "platform" test will fail as well and so on. Thus, as a rule of thumb, you -should try the test modes starting from "freezer", through "devices", "platform" -and "processors" up to "core" (repeat the test on each level a couple of times -to make sure that any random factors are avoided). - -If the "freezer" test fails, there is a task that cannot be frozen (in that case -it usually is possible to identify the offending task by analysing the output of -dmesg obtained after the failing test). Failure at this level usually means -that there is a problem with the tasks freezer subsystem that should be -reported. - -If the "devices" test fails, most likely there is a driver that cannot suspend -or resume its device (in the latter case the system may hang or become unstable -after the test, so please take that into consideration). To find this driver, -you can carry out a binary search according to the rules: -- if the test fails, unload a half of the drivers currently loaded and repeat -(that would probably involve rebooting the system, so always note what drivers -have been loaded before the test), -- if the test succeeds, load a half of the drivers you have unloaded most -recently and repeat. - -Once you have found the failing driver (there can be more than just one of -them), you have to unload it every time before hibernation. In that case please -make sure to report the problem with the driver. - -It is also possible that the "devices" test will still fail after you have -unloaded all modules. In that case, you may want to look in your kernel -configuration for the drivers that can be compiled as modules (and test again -with these drivers compiled as modules). You may also try to use some special -kernel command line options such as "noapic", "noacpi" or even "acpi=off". - -If the "platform" test fails, there is a problem with the handling of the -platform (eg. ACPI) firmware on your system. In that case the "platform" mode -of hibernation is not likely to work. You can try the "shutdown" mode, but that -is rather a poor man's workaround. - -If the "processors" test fails, the disabling/enabling of nonboot CPUs does not -work (of course, this only may be an issue on SMP systems) and the problem -should be reported. In that case you can also try to switch the nonboot CPUs -off and on using the /sys/devices/system/cpu/cpu*/online sysfs attributes and -see if that works. - -If the "core" test fails, which means that suspending of the system/platform -devices has failed (these devices are suspended on one CPU with interrupts off), -the problem is most probably hardware-related and serious, so it should be -reported. - -A failure of any of the "platform", "processors" or "core" tests may cause your -system to hang or become unstable, so please beware. Such a failure usually -indicates a serious problem that very well may be related to the hardware, but -please report it anyway. - -b) Testing minimal configuration - -If all of the hibernation test modes work, you can boot the system with the -"init=/bin/bash" command line parameter and attempt to hibernate in the -"reboot", "shutdown" and "platform" modes. If that does not work, there -probably is a problem with a driver statically compiled into the kernel and you -can try to compile more drivers as modules, so that they can be tested -individually. Otherwise, there is a problem with a modular driver and you can -find it by loading a half of the modules you normally use and binary searching -in accordance with the algorithm: -- if there are n modules loaded and the attempt to suspend and resume fails, -unload n/2 of the modules and try again (that would probably involve rebooting -the system), -- if there are n modules loaded and the attempt to suspend and resume succeeds, -load n/2 modules more and try again. - -Again, if you find the offending module(s), it(they) must be unloaded every time -before hibernation, and please report the problem with it(them). - -c) Using the "test_resume" hibernation option - -/sys/power/disk generally tells the kernel what to do after creating a -hibernation image. One of the available options is "test_resume" which -causes the just created image to be used for immediate restoration. Namely, -after doing: - -# echo test_resume > /sys/power/disk -# echo disk > /sys/power/state - -a hibernation image will be created and a resume from it will be triggered -immediately without involving the platform firmware in any way. - -That test can be used to check if failures to resume from hibernation are -related to bad interactions with the platform firmware. That is, if the above -works every time, but resume from actual hibernation does not work or is -unreliable, the platform firmware may be responsible for the failures. - -On architectures and platforms that support using different kernels to restore -hibernation images (that is, the kernel used to read the image from storage and -load it into memory is different from the one included in the image) or support -kernel address space randomization, it also can be used to check if failures -to resume may be related to the differences between the restore and image -kernels. - -d) Advanced debugging - -In case that hibernation does not work on your system even in the minimal -configuration and compiling more drivers as modules is not practical or some -modules cannot be unloaded, you can use one of the more advanced debugging -techniques to find the problem. First, if there is a serial port in your box, -you can boot the kernel with the 'no_console_suspend' parameter and try to log -kernel messages using the serial console. This may provide you with some -information about the reasons of the suspend (resume) failure. Alternatively, -it may be possible to use a FireWire port for debugging with firescope -(http://v3.sk/~lkundrak/firescope/). On x86 it is also possible to -use the PM_TRACE mechanism documented in Documentation/power/s2ram.txt . - -2. Testing suspend to RAM (STR) - -To verify that the STR works, it is generally more convenient to use the s2ram -tool available from http://suspend.sf.net and documented at -http://en.opensuse.org/SDB:Suspend_to_RAM (S2RAM_LINK). - -Namely, after writing "freezer", "devices", "platform", "processors", or "core" -into /sys/power/pm_test (available if the kernel is compiled with -CONFIG_PM_DEBUG set) the suspend code will work in the test mode corresponding -to given string. The STR test modes are defined in the same way as for -hibernation, so please refer to Section 1 for more information about them. In -particular, the "core" test allows you to test everything except for the actual -invocation of the platform firmware in order to put the system into the sleep -state. - -Among other things, the testing with the help of /sys/power/pm_test may allow -you to identify drivers that fail to suspend or resume their devices. They -should be unloaded every time before an STR transition. - -Next, you can follow the instructions at S2RAM_LINK to test the system, but if -it does not work "out of the box", you may need to boot it with -"init=/bin/bash" and test s2ram in the minimal configuration. In that case, -you may be able to search for failing drivers by following the procedure -analogous to the one described in section 1. If you find some failing drivers, -you will have to unload them every time before an STR transition (ie. before -you run s2ram), and please report the problems with them. - -There is a debugfs entry which shows the suspend to RAM statistics. Here is an -example of its output. - # mount -t debugfs none /sys/kernel/debug - # cat /sys/kernel/debug/suspend_stats - success: 20 - fail: 5 - failed_freeze: 0 - failed_prepare: 0 - failed_suspend: 5 - failed_suspend_noirq: 0 - failed_resume: 0 - failed_resume_noirq: 0 - failures: - last_failed_dev: alarm - adc - last_failed_errno: -16 - -16 - last_failed_step: suspend - suspend -Field success means the success number of suspend to RAM, and field fail means -the failure number. Others are the failure number of different steps of suspend -to RAM. suspend_stats just lists the last 2 failed devices, error number and -failed step of suspend. diff --git a/Documentation/power/charger-manager.rst b/Documentation/power/charger-manager.rst new file mode 100644 index 000000000000..84fab9376792 --- /dev/null +++ b/Documentation/power/charger-manager.rst @@ -0,0 +1,205 @@ +=============== +Charger Manager +=============== + + (C) 2011 MyungJoo Ham , GPL + +Charger Manager provides in-kernel battery charger management that +requires temperature monitoring during suspend-to-RAM state +and where each battery may have multiple chargers attached and the userland +wants to look at the aggregated information of the multiple chargers. + +Charger Manager is a platform_driver with power-supply-class entries. +An instance of Charger Manager (a platform-device created with Charger-Manager) +represents an independent battery with chargers. If there are multiple +batteries with their own chargers acting independently in a system, +the system may need multiple instances of Charger Manager. + +1. Introduction +=============== + +Charger Manager supports the following: + +* Support for multiple chargers (e.g., a device with USB, AC, and solar panels) + A system may have multiple chargers (or power sources) and some of + they may be activated at the same time. Each charger may have its + own power-supply-class and each power-supply-class can provide + different information about the battery status. This framework + aggregates charger-related information from multiple sources and + shows combined information as a single power-supply-class. + +* Support for in suspend-to-RAM polling (with suspend_again callback) + While the battery is being charged and the system is in suspend-to-RAM, + we may need to monitor the battery health by looking at the ambient or + battery temperature. We can accomplish this by waking up the system + periodically. However, such a method wakes up devices unnecessarily for + monitoring the battery health and tasks, and user processes that are + supposed to be kept suspended. That, in turn, incurs unnecessary power + consumption and slow down charging process. Or even, such peak power + consumption can stop chargers in the middle of charging + (external power input < device power consumption), which not + only affects the charging time, but the lifespan of the battery. + + Charger Manager provides a function "cm_suspend_again" that can be + used as suspend_again callback of platform_suspend_ops. If the platform + requires tasks other than cm_suspend_again, it may implement its own + suspend_again callback that calls cm_suspend_again in the middle. + Normally, the platform will need to resume and suspend some devices + that are used by Charger Manager. + +* Support for premature full-battery event handling + If the battery voltage drops by "fullbatt_vchkdrop_uV" after + "fullbatt_vchkdrop_ms" from the full-battery event, the framework + restarts charging. This check is also performed while suspended by + setting wakeup time accordingly and using suspend_again. + +* Support for uevent-notify + With the charger-related events, the device sends + notification to users with UEVENT. + +2. Global Charger-Manager Data related with suspend_again +========================================================= +In order to setup Charger Manager with suspend-again feature +(in-suspend monitoring), the user should provide charger_global_desc +with setup_charger_manager(`struct charger_global_desc *`). +This charger_global_desc data for in-suspend monitoring is global +as the name suggests. Thus, the user needs to provide only once even +if there are multiple batteries. If there are multiple batteries, the +multiple instances of Charger Manager share the same charger_global_desc +and it will manage in-suspend monitoring for all instances of Charger Manager. + +The user needs to provide all the three entries to `struct charger_global_desc` +properly in order to activate in-suspend monitoring: + +`char *rtc_name;` + The name of rtc (e.g., "rtc0") used to wakeup the system from + suspend for Charger Manager. The alarm interrupt (AIE) of the rtc + should be able to wake up the system from suspend. Charger Manager + saves and restores the alarm value and use the previously-defined + alarm if it is going to go off earlier than Charger Manager so that + Charger Manager does not interfere with previously-defined alarms. + +`bool (*rtc_only_wakeup)(void);` + This callback should let CM know whether + the wakeup-from-suspend is caused only by the alarm of "rtc" in the + same struct. If there is any other wakeup source triggered the + wakeup, it should return false. If the "rtc" is the only wakeup + reason, it should return true. + +`bool assume_timer_stops_in_suspend;` + if true, Charger Manager assumes that + the timer (CM uses jiffies as timer) stops during suspend. Then, CM + assumes that the suspend-duration is same as the alarm length. + + +3. How to setup suspend_again +============================= +Charger Manager provides a function "extern bool cm_suspend_again(void)". +When cm_suspend_again is called, it monitors every battery. The suspend_ops +callback of the system's platform_suspend_ops can call cm_suspend_again +function to know whether Charger Manager wants to suspend again or not. +If there are no other devices or tasks that want to use suspend_again +feature, the platform_suspend_ops may directly refer to cm_suspend_again +for its suspend_again callback. + +The cm_suspend_again() returns true (meaning "I want to suspend again") +if the system was woken up by Charger Manager and the polling +(in-suspend monitoring) results in "normal". + +4. Charger-Manager Data (struct charger_desc) +============================================= +For each battery charged independently from other batteries (if a series of +batteries are charged by a single charger, they are counted as one independent +battery), an instance of Charger Manager is attached to it. The following + +struct charger_desc elements: + +`char *psy_name;` + The power-supply-class name of the battery. Default is + "battery" if psy_name is NULL. Users can access the psy entries + at "/sys/class/power_supply/[psy_name]/". + +`enum polling_modes polling_mode;` + CM_POLL_DISABLE: + do not poll this battery. + CM_POLL_ALWAYS: + always poll this battery. + CM_POLL_EXTERNAL_POWER_ONLY: + poll this battery if and only if an external power + source is attached. + CM_POLL_CHARGING_ONLY: + poll this battery if and only if the battery is being charged. + +`unsigned int fullbatt_vchkdrop_ms; / unsigned int fullbatt_vchkdrop_uV;` + If both have non-zero values, Charger Manager will check the + battery voltage drop fullbatt_vchkdrop_ms after the battery is fully + charged. If the voltage drop is over fullbatt_vchkdrop_uV, Charger + Manager will try to recharge the battery by disabling and enabling + chargers. Recharge with voltage drop condition only (without delay + condition) is needed to be implemented with hardware interrupts from + fuel gauges or charger devices/chips. + +`unsigned int fullbatt_uV;` + If specified with a non-zero value, Charger Manager assumes + that the battery is full (capacity = 100) if the battery is not being + charged and the battery voltage is equal to or greater than + fullbatt_uV. + +`unsigned int polling_interval_ms;` + Required polling interval in ms. Charger Manager will poll + this battery every polling_interval_ms or more frequently. + +`enum data_source battery_present;` + CM_BATTERY_PRESENT: + assume that the battery exists. + CM_NO_BATTERY: + assume that the battery does not exists. + CM_FUEL_GAUGE: + get battery presence information from fuel gauge. + CM_CHARGER_STAT: + get battery presence from chargers. + +`char **psy_charger_stat;` + An array ending with NULL that has power-supply-class names of + chargers. Each power-supply-class should provide "PRESENT" (if + battery_present is "CM_CHARGER_STAT"), "ONLINE" (shows whether an + external power source is attached or not), and "STATUS" (shows whether + the battery is {"FULL" or not FULL} or {"FULL", "Charging", + "Discharging", "NotCharging"}). + +`int num_charger_regulators; / struct regulator_bulk_data *charger_regulators;` + Regulators representing the chargers in the form for + regulator framework's bulk functions. + +`char *psy_fuel_gauge;` + Power-supply-class name of the fuel gauge. + +`int (*temperature_out_of_range)(int *mC); / bool measure_battery_temp;` + This callback returns 0 if the temperature is safe for charging, + a positive number if it is too hot to charge, and a negative number + if it is too cold to charge. With the variable mC, the callback returns + the temperature in 1/1000 of centigrade. + The source of temperature can be battery or ambient one according to + the value of measure_battery_temp. + + +5. Notify Charger-Manager of charger events: cm_notify_event() +============================================================== +If there is an charger event is required to notify +Charger Manager, a charger device driver that triggers the event can call +cm_notify_event(psy, type, msg) to notify the corresponding Charger Manager. +In the function, psy is the charger driver's power_supply pointer, which is +associated with Charger-Manager. The parameter "type" +is the same as irq's type (enum cm_event_types). The event message "msg" is +optional and is effective only if the event type is "UNDESCRIBED" or "OTHERS". + +6. Other Considerations +======================= + +At the charger/battery-related events such as battery-pulled-out, +charger-pulled-out, charger-inserted, DCIN-over/under-voltage, charger-stopped, +and others critical to chargers, the system should be configured to wake up. +At least the following should wake up the system from a suspend: +a) charger-on/off b) external-power-in/out c) battery-in/out (while charging) + +It is usually accomplished by configuring the PMIC as a wakeup source. diff --git a/Documentation/power/charger-manager.txt b/Documentation/power/charger-manager.txt deleted file mode 100644 index 9ff1105e58d6..000000000000 --- a/Documentation/power/charger-manager.txt +++ /dev/null @@ -1,200 +0,0 @@ -Charger Manager - (C) 2011 MyungJoo Ham , GPL - -Charger Manager provides in-kernel battery charger management that -requires temperature monitoring during suspend-to-RAM state -and where each battery may have multiple chargers attached and the userland -wants to look at the aggregated information of the multiple chargers. - -Charger Manager is a platform_driver with power-supply-class entries. -An instance of Charger Manager (a platform-device created with Charger-Manager) -represents an independent battery with chargers. If there are multiple -batteries with their own chargers acting independently in a system, -the system may need multiple instances of Charger Manager. - -1. Introduction -=============== - -Charger Manager supports the following: - -* Support for multiple chargers (e.g., a device with USB, AC, and solar panels) - A system may have multiple chargers (or power sources) and some of - they may be activated at the same time. Each charger may have its - own power-supply-class and each power-supply-class can provide - different information about the battery status. This framework - aggregates charger-related information from multiple sources and - shows combined information as a single power-supply-class. - -* Support for in suspend-to-RAM polling (with suspend_again callback) - While the battery is being charged and the system is in suspend-to-RAM, - we may need to monitor the battery health by looking at the ambient or - battery temperature. We can accomplish this by waking up the system - periodically. However, such a method wakes up devices unnecessarily for - monitoring the battery health and tasks, and user processes that are - supposed to be kept suspended. That, in turn, incurs unnecessary power - consumption and slow down charging process. Or even, such peak power - consumption can stop chargers in the middle of charging - (external power input < device power consumption), which not - only affects the charging time, but the lifespan of the battery. - - Charger Manager provides a function "cm_suspend_again" that can be - used as suspend_again callback of platform_suspend_ops. If the platform - requires tasks other than cm_suspend_again, it may implement its own - suspend_again callback that calls cm_suspend_again in the middle. - Normally, the platform will need to resume and suspend some devices - that are used by Charger Manager. - -* Support for premature full-battery event handling - If the battery voltage drops by "fullbatt_vchkdrop_uV" after - "fullbatt_vchkdrop_ms" from the full-battery event, the framework - restarts charging. This check is also performed while suspended by - setting wakeup time accordingly and using suspend_again. - -* Support for uevent-notify - With the charger-related events, the device sends - notification to users with UEVENT. - -2. Global Charger-Manager Data related with suspend_again -======================================================== -In order to setup Charger Manager with suspend-again feature -(in-suspend monitoring), the user should provide charger_global_desc -with setup_charger_manager(struct charger_global_desc *). -This charger_global_desc data for in-suspend monitoring is global -as the name suggests. Thus, the user needs to provide only once even -if there are multiple batteries. If there are multiple batteries, the -multiple instances of Charger Manager share the same charger_global_desc -and it will manage in-suspend monitoring for all instances of Charger Manager. - -The user needs to provide all the three entries properly in order to activate -in-suspend monitoring: - -struct charger_global_desc { - -char *rtc_name; - : The name of rtc (e.g., "rtc0") used to wakeup the system from - suspend for Charger Manager. The alarm interrupt (AIE) of the rtc - should be able to wake up the system from suspend. Charger Manager - saves and restores the alarm value and use the previously-defined - alarm if it is going to go off earlier than Charger Manager so that - Charger Manager does not interfere with previously-defined alarms. - -bool (*rtc_only_wakeup)(void); - : This callback should let CM know whether - the wakeup-from-suspend is caused only by the alarm of "rtc" in the - same struct. If there is any other wakeup source triggered the - wakeup, it should return false. If the "rtc" is the only wakeup - reason, it should return true. - -bool assume_timer_stops_in_suspend; - : if true, Charger Manager assumes that - the timer (CM uses jiffies as timer) stops during suspend. Then, CM - assumes that the suspend-duration is same as the alarm length. -}; - -3. How to setup suspend_again -============================= -Charger Manager provides a function "extern bool cm_suspend_again(void)". -When cm_suspend_again is called, it monitors every battery. The suspend_ops -callback of the system's platform_suspend_ops can call cm_suspend_again -function to know whether Charger Manager wants to suspend again or not. -If there are no other devices or tasks that want to use suspend_again -feature, the platform_suspend_ops may directly refer to cm_suspend_again -for its suspend_again callback. - -The cm_suspend_again() returns true (meaning "I want to suspend again") -if the system was woken up by Charger Manager and the polling -(in-suspend monitoring) results in "normal". - -4. Charger-Manager Data (struct charger_desc) -============================================= -For each battery charged independently from other batteries (if a series of -batteries are charged by a single charger, they are counted as one independent -battery), an instance of Charger Manager is attached to it. - -struct charger_desc { - -char *psy_name; - : The power-supply-class name of the battery. Default is - "battery" if psy_name is NULL. Users can access the psy entries - at "/sys/class/power_supply/[psy_name]/". - -enum polling_modes polling_mode; - : CM_POLL_DISABLE: do not poll this battery. - CM_POLL_ALWAYS: always poll this battery. - CM_POLL_EXTERNAL_POWER_ONLY: poll this battery if and only if - an external power source is attached. - CM_POLL_CHARGING_ONLY: poll this battery if and only if the - battery is being charged. - -unsigned int fullbatt_vchkdrop_ms; -unsigned int fullbatt_vchkdrop_uV; - : If both have non-zero values, Charger Manager will check the - battery voltage drop fullbatt_vchkdrop_ms after the battery is fully - charged. If the voltage drop is over fullbatt_vchkdrop_uV, Charger - Manager will try to recharge the battery by disabling and enabling - chargers. Recharge with voltage drop condition only (without delay - condition) is needed to be implemented with hardware interrupts from - fuel gauges or charger devices/chips. - -unsigned int fullbatt_uV; - : If specified with a non-zero value, Charger Manager assumes - that the battery is full (capacity = 100) if the battery is not being - charged and the battery voltage is equal to or greater than - fullbatt_uV. - -unsigned int polling_interval_ms; - : Required polling interval in ms. Charger Manager will poll - this battery every polling_interval_ms or more frequently. - -enum data_source battery_present; - : CM_BATTERY_PRESENT: assume that the battery exists. - CM_NO_BATTERY: assume that the battery does not exists. - CM_FUEL_GAUGE: get battery presence information from fuel gauge. - CM_CHARGER_STAT: get battery presence from chargers. - -char **psy_charger_stat; - : An array ending with NULL that has power-supply-class names of - chargers. Each power-supply-class should provide "PRESENT" (if - battery_present is "CM_CHARGER_STAT"), "ONLINE" (shows whether an - external power source is attached or not), and "STATUS" (shows whether - the battery is {"FULL" or not FULL} or {"FULL", "Charging", - "Discharging", "NotCharging"}). - -int num_charger_regulators; -struct regulator_bulk_data *charger_regulators; - : Regulators representing the chargers in the form for - regulator framework's bulk functions. - -char *psy_fuel_gauge; - : Power-supply-class name of the fuel gauge. - -int (*temperature_out_of_range)(int *mC); -bool measure_battery_temp; - : This callback returns 0 if the temperature is safe for charging, - a positive number if it is too hot to charge, and a negative number - if it is too cold to charge. With the variable mC, the callback returns - the temperature in 1/1000 of centigrade. - The source of temperature can be battery or ambient one according to - the value of measure_battery_temp. -}; - -5. Notify Charger-Manager of charger events: cm_notify_event() -========================================================= -If there is an charger event is required to notify -Charger Manager, a charger device driver that triggers the event can call -cm_notify_event(psy, type, msg) to notify the corresponding Charger Manager. -In the function, psy is the charger driver's power_supply pointer, which is -associated with Charger-Manager. The parameter "type" -is the same as irq's type (enum cm_event_types). The event message "msg" is -optional and is effective only if the event type is "UNDESCRIBED" or "OTHERS". - -6. Other Considerations -======================= - -At the charger/battery-related events such as battery-pulled-out, -charger-pulled-out, charger-inserted, DCIN-over/under-voltage, charger-stopped, -and others critical to chargers, the system should be configured to wake up. -At least the following should wake up the system from a suspend: -a) charger-on/off b) external-power-in/out c) battery-in/out (while charging) - -It is usually accomplished by configuring the PMIC as a wakeup source. diff --git a/Documentation/power/drivers-testing.rst b/Documentation/power/drivers-testing.rst new file mode 100644 index 000000000000..e53f1999fc39 --- /dev/null +++ b/Documentation/power/drivers-testing.rst @@ -0,0 +1,51 @@ +==================================================== +Testing suspend and resume support in device drivers +==================================================== + + (C) 2007 Rafael J. Wysocki , GPL + +1. Preparing the test system +============================ + +Unfortunately, to effectively test the support for the system-wide suspend and +resume transitions in a driver, it is necessary to suspend and resume a fully +functional system with this driver loaded. Moreover, that should be done +several times, preferably several times in a row, and separately for hibernation +(aka suspend to disk or STD) and suspend to RAM (STR), because each of these +cases involves slightly different operations and different interactions with +the machine's BIOS. + +Of course, for this purpose the test system has to be known to suspend and +resume without the driver being tested. Thus, if possible, you should first +resolve all suspend/resume-related problems in the test system before you start +testing the new driver. Please see Documentation/power/basic-pm-debugging.rst +for more information about the debugging of suspend/resume functionality. + +2. Testing the driver +===================== + +Once you have resolved the suspend/resume-related problems with your test system +without the new driver, you are ready to test it: + +a) Build the driver as a module, load it and try the test modes of hibernation + (see: Documentation/power/basic-pm-debugging.rst, 1). + +b) Load the driver and attempt to hibernate in the "reboot", "shutdown" and + "platform" modes (see: Documentation/power/basic-pm-debugging.rst, 1). + +c) Compile the driver directly into the kernel and try the test modes of + hibernation. + +d) Attempt to hibernate with the driver compiled directly into the kernel + in the "reboot", "shutdown" and "platform" modes. + +e) Try the test modes of suspend (see: Documentation/power/basic-pm-debugging.rst, + 2). [As far as the STR tests are concerned, it should not matter whether or + not the driver is built as a module.] + +f) Attempt to suspend to RAM using the s2ram tool with the driver loaded + (see: Documentation/power/basic-pm-debugging.rst, 2). + +Each of the above tests should be repeated several times and the STD tests +should be mixed with the STR tests. If any of them fails, the driver cannot be +regarded as suspend/resume-safe. diff --git a/Documentation/power/drivers-testing.txt b/Documentation/power/drivers-testing.txt deleted file mode 100644 index 638afdf4d6b8..000000000000 --- a/Documentation/power/drivers-testing.txt +++ /dev/null @@ -1,46 +0,0 @@ -Testing suspend and resume support in device drivers - (C) 2007 Rafael J. Wysocki , GPL - -1. Preparing the test system - -Unfortunately, to effectively test the support for the system-wide suspend and -resume transitions in a driver, it is necessary to suspend and resume a fully -functional system with this driver loaded. Moreover, that should be done -several times, preferably several times in a row, and separately for hibernation -(aka suspend to disk or STD) and suspend to RAM (STR), because each of these -cases involves slightly different operations and different interactions with -the machine's BIOS. - -Of course, for this purpose the test system has to be known to suspend and -resume without the driver being tested. Thus, if possible, you should first -resolve all suspend/resume-related problems in the test system before you start -testing the new driver. Please see Documentation/power/basic-pm-debugging.txt -for more information about the debugging of suspend/resume functionality. - -2. Testing the driver - -Once you have resolved the suspend/resume-related problems with your test system -without the new driver, you are ready to test it: - -a) Build the driver as a module, load it and try the test modes of hibernation - (see: Documentation/power/basic-pm-debugging.txt, 1). - -b) Load the driver and attempt to hibernate in the "reboot", "shutdown" and - "platform" modes (see: Documentation/power/basic-pm-debugging.txt, 1). - -c) Compile the driver directly into the kernel and try the test modes of - hibernation. - -d) Attempt to hibernate with the driver compiled directly into the kernel - in the "reboot", "shutdown" and "platform" modes. - -e) Try the test modes of suspend (see: Documentation/power/basic-pm-debugging.txt, - 2). [As far as the STR tests are concerned, it should not matter whether or - not the driver is built as a module.] - -f) Attempt to suspend to RAM using the s2ram tool with the driver loaded - (see: Documentation/power/basic-pm-debugging.txt, 2). - -Each of the above tests should be repeated several times and the STD tests -should be mixed with the STR tests. If any of them fails, the driver cannot be -regarded as suspend/resume-safe. diff --git a/Documentation/power/energy-model.rst b/Documentation/power/energy-model.rst new file mode 100644 index 000000000000..90a345d57ae9 --- /dev/null +++ b/Documentation/power/energy-model.rst @@ -0,0 +1,147 @@ +==================== +Energy Model of CPUs +==================== + +1. Overview +----------- + +The Energy Model (EM) framework serves as an interface between drivers knowing +the power consumed by CPUs at various performance levels, and the kernel +subsystems willing to use that information to make energy-aware decisions. + +The source of the information about the power consumed by CPUs can vary greatly +from one platform to another. These power costs can be estimated using +devicetree data in some cases. In others, the firmware will know better. +Alternatively, userspace might be best positioned. And so on. In order to avoid +each and every client subsystem to re-implement support for each and every +possible source of information on its own, the EM framework intervenes as an +abstraction layer which standardizes the format of power cost tables in the +kernel, hence enabling to avoid redundant work. + +The figure below depicts an example of drivers (Arm-specific here, but the +approach is applicable to any architecture) providing power costs to the EM +framework, and interested clients reading the data from it:: + + +---------------+ +-----------------+ +---------------+ + | Thermal (IPA) | | Scheduler (EAS) | | Other | + +---------------+ +-----------------+ +---------------+ + | | em_pd_energy() | + | | em_cpu_get() | + +---------+ | +---------+ + | | | + v v v + +---------------------+ + | Energy Model | + | Framework | + +---------------------+ + ^ ^ ^ + | | | em_register_perf_domain() + +----------+ | +---------+ + | | | + +---------------+ +---------------+ +--------------+ + | cpufreq-dt | | arm_scmi | | Other | + +---------------+ +---------------+ +--------------+ + ^ ^ ^ + | | | + +--------------+ +---------------+ +--------------+ + | Device Tree | | Firmware | | ? | + +--------------+ +---------------+ +--------------+ + +The EM framework manages power cost tables per 'performance domain' in the +system. A performance domain is a group of CPUs whose performance is scaled +together. Performance domains generally have a 1-to-1 mapping with CPUFreq +policies. All CPUs in a performance domain are required to have the same +micro-architecture. CPUs in different performance domains can have different +micro-architectures. + + +2. Core APIs +------------ + +2.1 Config options +^^^^^^^^^^^^^^^^^^ + +CONFIG_ENERGY_MODEL must be enabled to use the EM framework. + + +2.2 Registration of performance domains +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Drivers are expected to register performance domains into the EM framework by +calling the following API:: + + int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, + struct em_data_callback *cb); + +Drivers must specify the CPUs of the performance domains using the cpumask +argument, and provide a callback function returning tuples +for each capacity state. The callback function provided by the driver is free +to fetch data from any relevant location (DT, firmware, ...), and by any mean +deemed necessary. See Section 3. for an example of driver implementing this +callback, and kernel/power/energy_model.c for further documentation on this +API. + + +2.3 Accessing performance domains +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Subsystems interested in the energy model of a CPU can retrieve it using the +em_cpu_get() API. The energy model tables are allocated once upon creation of +the performance domains, and kept in memory untouched. + +The energy consumed by a performance domain can be estimated using the +em_pd_energy() API. The estimation is performed assuming that the schedutil +CPUfreq governor is in use. + +More details about the above APIs can be found in include/linux/energy_model.h. + + +3. Example driver +----------------- + +This section provides a simple example of a CPUFreq driver registering a +performance domain in the Energy Model framework using the (fake) 'foo' +protocol. The driver implements an est_power() function to be provided to the +EM framework:: + + -> drivers/cpufreq/foo_cpufreq.c + + 01 static int est_power(unsigned long *mW, unsigned long *KHz, int cpu) + 02 { + 03 long freq, power; + 04 + 05 /* Use the 'foo' protocol to ceil the frequency */ + 06 freq = foo_get_freq_ceil(cpu, *KHz); + 07 if (freq < 0); + 08 return freq; + 09 + 10 /* Estimate the power cost for the CPU at the relevant freq. */ + 11 power = foo_estimate_power(cpu, freq); + 12 if (power < 0); + 13 return power; + 14 + 15 /* Return the values to the EM framework */ + 16 *mW = power; + 17 *KHz = freq; + 18 + 19 return 0; + 20 } + 21 + 22 static int foo_cpufreq_init(struct cpufreq_policy *policy) + 23 { + 24 struct em_data_callback em_cb = EM_DATA_CB(est_power); + 25 int nr_opp, ret; + 26 + 27 /* Do the actual CPUFreq init work ... */ + 28 ret = do_foo_cpufreq_init(policy); + 29 if (ret) + 30 return ret; + 31 + 32 /* Find the number of OPPs for this policy */ + 33 nr_opp = foo_get_nr_opp(policy); + 34 + 35 /* And register the new performance domain */ + 36 em_register_perf_domain(policy->cpus, nr_opp, &em_cb); + 37 + 38 return 0; + 39 } diff --git a/Documentation/power/energy-model.txt b/Documentation/power/energy-model.txt deleted file mode 100644 index a2b0ae4c76bd..000000000000 --- a/Documentation/power/energy-model.txt +++ /dev/null @@ -1,144 +0,0 @@ - ==================== - Energy Model of CPUs - ==================== - -1. Overview ------------ - -The Energy Model (EM) framework serves as an interface between drivers knowing -the power consumed by CPUs at various performance levels, and the kernel -subsystems willing to use that information to make energy-aware decisions. - -The source of the information about the power consumed by CPUs can vary greatly -from one platform to another. These power costs can be estimated using -devicetree data in some cases. In others, the firmware will know better. -Alternatively, userspace might be best positioned. And so on. In order to avoid -each and every client subsystem to re-implement support for each and every -possible source of information on its own, the EM framework intervenes as an -abstraction layer which standardizes the format of power cost tables in the -kernel, hence enabling to avoid redundant work. - -The figure below depicts an example of drivers (Arm-specific here, but the -approach is applicable to any architecture) providing power costs to the EM -framework, and interested clients reading the data from it. - - +---------------+ +-----------------+ +---------------+ - | Thermal (IPA) | | Scheduler (EAS) | | Other | - +---------------+ +-----------------+ +---------------+ - | | em_pd_energy() | - | | em_cpu_get() | - +---------+ | +---------+ - | | | - v v v - +---------------------+ - | Energy Model | - | Framework | - +---------------------+ - ^ ^ ^ - | | | em_register_perf_domain() - +----------+ | +---------+ - | | | - +---------------+ +---------------+ +--------------+ - | cpufreq-dt | | arm_scmi | | Other | - +---------------+ +---------------+ +--------------+ - ^ ^ ^ - | | | - +--------------+ +---------------+ +--------------+ - | Device Tree | | Firmware | | ? | - +--------------+ +---------------+ +--------------+ - -The EM framework manages power cost tables per 'performance domain' in the -system. A performance domain is a group of CPUs whose performance is scaled -together. Performance domains generally have a 1-to-1 mapping with CPUFreq -policies. All CPUs in a performance domain are required to have the same -micro-architecture. CPUs in different performance domains can have different -micro-architectures. - - -2. Core APIs ------------- - - 2.1 Config options - -CONFIG_ENERGY_MODEL must be enabled to use the EM framework. - - - 2.2 Registration of performance domains - -Drivers are expected to register performance domains into the EM framework by -calling the following API: - - int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, - struct em_data_callback *cb); - -Drivers must specify the CPUs of the performance domains using the cpumask -argument, and provide a callback function returning tuples -for each capacity state. The callback function provided by the driver is free -to fetch data from any relevant location (DT, firmware, ...), and by any mean -deemed necessary. See Section 3. for an example of driver implementing this -callback, and kernel/power/energy_model.c for further documentation on this -API. - - - 2.3 Accessing performance domains - -Subsystems interested in the energy model of a CPU can retrieve it using the -em_cpu_get() API. The energy model tables are allocated once upon creation of -the performance domains, and kept in memory untouched. - -The energy consumed by a performance domain can be estimated using the -em_pd_energy() API. The estimation is performed assuming that the schedutil -CPUfreq governor is in use. - -More details about the above APIs can be found in include/linux/energy_model.h. - - -3. Example driver ------------------ - -This section provides a simple example of a CPUFreq driver registering a -performance domain in the Energy Model framework using the (fake) 'foo' -protocol. The driver implements an est_power() function to be provided to the -EM framework. - - -> drivers/cpufreq/foo_cpufreq.c - -01 static int est_power(unsigned long *mW, unsigned long *KHz, int cpu) -02 { -03 long freq, power; -04 -05 /* Use the 'foo' protocol to ceil the frequency */ -06 freq = foo_get_freq_ceil(cpu, *KHz); -07 if (freq < 0); -08 return freq; -09 -10 /* Estimate the power cost for the CPU at the relevant freq. */ -11 power = foo_estimate_power(cpu, freq); -12 if (power < 0); -13 return power; -14 -15 /* Return the values to the EM framework */ -16 *mW = power; -17 *KHz = freq; -18 -19 return 0; -20 } -21 -22 static int foo_cpufreq_init(struct cpufreq_policy *policy) -23 { -24 struct em_data_callback em_cb = EM_DATA_CB(est_power); -25 int nr_opp, ret; -26 -27 /* Do the actual CPUFreq init work ... */ -28 ret = do_foo_cpufreq_init(policy); -29 if (ret) -30 return ret; -31 -32 /* Find the number of OPPs for this policy */ -33 nr_opp = foo_get_nr_opp(policy); -34 -35 /* And register the new performance domain */ -36 em_register_perf_domain(policy->cpus, nr_opp, &em_cb); -37 -38 return 0; -39 } diff --git a/Documentation/power/freezing-of-tasks.rst b/Documentation/power/freezing-of-tasks.rst new file mode 100644 index 000000000000..ef110fe55e82 --- /dev/null +++ b/Documentation/power/freezing-of-tasks.rst @@ -0,0 +1,244 @@ +================= +Freezing of tasks +================= + +(C) 2007 Rafael J. Wysocki , GPL + +I. What is the freezing of tasks? +================================= + +The freezing of tasks is a mechanism by which user space processes and some +kernel threads are controlled during hibernation or system-wide suspend (on some +architectures). + +II. How does it work? +===================== + +There are three per-task flags used for that, PF_NOFREEZE, PF_FROZEN +and PF_FREEZER_SKIP (the last one is auxiliary). The tasks that have +PF_NOFREEZE unset (all user space processes and some kernel threads) are +regarded as 'freezable' and treated in a special way before the system enters a +suspend state as well as before a hibernation image is created (in what follows +we only consider hibernation, but the description also applies to suspend). + +Namely, as the first step of the hibernation procedure the function +freeze_processes() (defined in kernel/power/process.c) is called. A system-wide +variable system_freezing_cnt (as opposed to a per-task flag) is used to indicate +whether the system is to undergo a freezing operation. And freeze_processes() +sets this variable. After this, it executes try_to_freeze_tasks() that sends a +fake signal to all user space processes, and wakes up all the kernel threads. +All freezable tasks must react to that by calling try_to_freeze(), which +results in a call to __refrigerator() (defined in kernel/freezer.c), which sets +the task's PF_FROZEN flag, changes its state to TASK_UNINTERRUPTIBLE and makes +it loop until PF_FROZEN is cleared for it. Then, we say that the task is +'frozen' and therefore the set of functions handling this mechanism is referred +to as 'the freezer' (these functions are defined in kernel/power/process.c, +kernel/freezer.c & include/linux/freezer.h). User space processes are generally +frozen before kernel threads. + +__refrigerator() must not be called directly. Instead, use the +try_to_freeze() function (defined in include/linux/freezer.h), that checks +if the task is to be frozen and makes the task enter __refrigerator(). + +For user space processes try_to_freeze() is called automatically from the +signal-handling code, but the freezable kernel threads need to call it +explicitly in suitable places or use the wait_event_freezable() or +wait_event_freezable_timeout() macros (defined in include/linux/freezer.h) +that combine interruptible sleep with checking if the task is to be frozen and +calling try_to_freeze(). The main loop of a freezable kernel thread may look +like the following one:: + + set_freezable(); + do { + hub_events(); + wait_event_freezable(khubd_wait, + !list_empty(&hub_event_list) || + kthread_should_stop()); + } while (!kthread_should_stop() || !list_empty(&hub_event_list)); + +(from drivers/usb/core/hub.c::hub_thread()). + +If a freezable kernel thread fails to call try_to_freeze() after the freezer has +initiated a freezing operation, the freezing of tasks will fail and the entire +hibernation operation will be cancelled. For this reason, freezable kernel +threads must call try_to_freeze() somewhere or use one of the +wait_event_freezable() and wait_event_freezable_timeout() macros. + +After the system memory state has been restored from a hibernation image and +devices have been reinitialized, the function thaw_processes() is called in +order to clear the PF_FROZEN flag for each frozen task. Then, the tasks that +have been frozen leave __refrigerator() and continue running. + + +Rationale behind the functions dealing with freezing and thawing of tasks +------------------------------------------------------------------------- + +freeze_processes(): + - freezes only userspace tasks + +freeze_kernel_threads(): + - freezes all tasks (including kernel threads) because we can't freeze + kernel threads without freezing userspace tasks + +thaw_kernel_threads(): + - thaws only kernel threads; this is particularly useful if we need to do + anything special in between thawing of kernel threads and thawing of + userspace tasks, or if we want to postpone the thawing of userspace tasks + +thaw_processes(): + - thaws all tasks (including kernel threads) because we can't thaw userspace + tasks without thawing kernel threads + + +III. Which kernel threads are freezable? +======================================== + +Kernel threads are not freezable by default. However, a kernel thread may clear +PF_NOFREEZE for itself by calling set_freezable() (the resetting of PF_NOFREEZE +directly is not allowed). From this point it is regarded as freezable +and must call try_to_freeze() in a suitable place. + +IV. Why do we do that? +====================== + +Generally speaking, there is a couple of reasons to use the freezing of tasks: + +1. The principal reason is to prevent filesystems from being damaged after + hibernation. At the moment we have no simple means of checkpointing + filesystems, so if there are any modifications made to filesystem data and/or + metadata on disks, we cannot bring them back to the state from before the + modifications. At the same time each hibernation image contains some + filesystem-related information that must be consistent with the state of the + on-disk data and metadata after the system memory state has been restored + from the image (otherwise the filesystems will be damaged in a nasty way, + usually making them almost impossible to repair). We therefore freeze + tasks that might cause the on-disk filesystems' data and metadata to be + modified after the hibernation image has been created and before the + system is finally powered off. The majority of these are user space + processes, but if any of the kernel threads may cause something like this + to happen, they have to be freezable. + +2. Next, to create the hibernation image we need to free a sufficient amount of + memory (approximately 50% of available RAM) and we need to do that before + devices are deactivated, because we generally need them for swapping out. + Then, after the memory for the image has been freed, we don't want tasks + to allocate additional memory and we prevent them from doing that by + freezing them earlier. [Of course, this also means that device drivers + should not allocate substantial amounts of memory from their .suspend() + callbacks before hibernation, but this is a separate issue.] + +3. The third reason is to prevent user space processes and some kernel threads + from interfering with the suspending and resuming of devices. A user space + process running on a second CPU while we are suspending devices may, for + example, be troublesome and without the freezing of tasks we would need some + safeguards against race conditions that might occur in such a case. + +Although Linus Torvalds doesn't like the freezing of tasks, he said this in one +of the discussions on LKML (http://lkml.org/lkml/2007/4/27/608): + +"RJW:> Why we freeze tasks at all or why we freeze kernel threads? + +Linus: In many ways, 'at all'. + +I **do** realize the IO request queue issues, and that we cannot actually do +s2ram with some devices in the middle of a DMA. So we want to be able to +avoid *that*, there's no question about that. And I suspect that stopping +user threads and then waiting for a sync is practically one of the easier +ways to do so. + +So in practice, the 'at all' may become a 'why freeze kernel threads?' and +freezing user threads I don't find really objectionable." + +Still, there are kernel threads that may want to be freezable. For example, if +a kernel thread that belongs to a device driver accesses the device directly, it +in principle needs to know when the device is suspended, so that it doesn't try +to access it at that time. However, if the kernel thread is freezable, it will +be frozen before the driver's .suspend() callback is executed and it will be +thawed after the driver's .resume() callback has run, so it won't be accessing +the device while it's suspended. + +4. Another reason for freezing tasks is to prevent user space processes from + realizing that hibernation (or suspend) operation takes place. Ideally, user + space processes should not notice that such a system-wide operation has + occurred and should continue running without any problems after the restore + (or resume from suspend). Unfortunately, in the most general case this + is quite difficult to achieve without the freezing of tasks. Consider, + for example, a process that depends on all CPUs being online while it's + running. Since we need to disable nonboot CPUs during the hibernation, + if this process is not frozen, it may notice that the number of CPUs has + changed and may start to work incorrectly because of that. + +V. Are there any problems related to the freezing of tasks? +=========================================================== + +Yes, there are. + +First of all, the freezing of kernel threads may be tricky if they depend one +on another. For example, if kernel thread A waits for a completion (in the +TASK_UNINTERRUPTIBLE state) that needs to be done by freezable kernel thread B +and B is frozen in the meantime, then A will be blocked until B is thawed, which +may be undesirable. That's why kernel threads are not freezable by default. + +Second, there are the following two problems related to the freezing of user +space processes: + +1. Putting processes into an uninterruptible sleep distorts the load average. +2. Now that we have FUSE, plus the framework for doing device drivers in + userspace, it gets even more complicated because some userspace processes are + now doing the sorts of things that kernel threads do + (https://lists.linux-foundation.org/pipermail/linux-pm/2007-May/012309.html). + +The problem 1. seems to be fixable, although it hasn't been fixed so far. The +other one is more serious, but it seems that we can work around it by using +hibernation (and suspend) notifiers (in that case, though, we won't be able to +avoid the realization by the user space processes that the hibernation is taking +place). + +There are also problems that the freezing of tasks tends to expose, although +they are not directly related to it. For example, if request_firmware() is +called from a device driver's .resume() routine, it will timeout and eventually +fail, because the user land process that should respond to the request is frozen +at this point. So, seemingly, the failure is due to the freezing of tasks. +Suppose, however, that the firmware file is located on a filesystem accessible +only through another device that hasn't been resumed yet. In that case, +request_firmware() will fail regardless of whether or not the freezing of tasks +is used. Consequently, the problem is not really related to the freezing of +tasks, since it generally exists anyway. + +A driver must have all firmwares it may need in RAM before suspend() is called. +If keeping them is not practical, for example due to their size, they must be +requested early enough using the suspend notifier API described in +Documentation/driver-api/pm/notifiers.rst. + +VI. Are there any precautions to be taken to prevent freezing failures? +======================================================================= + +Yes, there are. + +First of all, grabbing the 'system_transition_mutex' lock to mutually exclude a piece of code +from system-wide sleep such as suspend/hibernation is not encouraged. +If possible, that piece of code must instead hook onto the suspend/hibernation +notifiers to achieve mutual exclusion. Look at the CPU-Hotplug code +(kernel/cpu.c) for an example. + +However, if that is not feasible, and grabbing 'system_transition_mutex' is deemed necessary, +it is strongly discouraged to directly call mutex_[un]lock(&system_transition_mutex) since +that could lead to freezing failures, because if the suspend/hibernate code +successfully acquired the 'system_transition_mutex' lock, and hence that other entity failed +to acquire the lock, then that task would get blocked in TASK_UNINTERRUPTIBLE +state. As a consequence, the freezer would not be able to freeze that task, +leading to freezing failure. + +However, the [un]lock_system_sleep() APIs are safe to use in this scenario, +since they ask the freezer to skip freezing this task, since it is anyway +"frozen enough" as it is blocked on 'system_transition_mutex', which will be released +only after the entire suspend/hibernation sequence is complete. +So, to summarize, use [un]lock_system_sleep() instead of directly using +mutex_[un]lock(&system_transition_mutex). That would prevent freezing failures. + +V. Miscellaneous +================ + +/sys/power/pm_freeze_timeout controls how long it will cost at most to freeze +all user space processes or all freezable kernel threads, in unit of millisecond. +The default value is 20000, with range of unsigned integer. diff --git a/Documentation/power/freezing-of-tasks.txt b/Documentation/power/freezing-of-tasks.txt deleted file mode 100644 index cd283190855a..000000000000 --- a/Documentation/power/freezing-of-tasks.txt +++ /dev/null @@ -1,231 +0,0 @@ -Freezing of tasks - (C) 2007 Rafael J. Wysocki , GPL - -I. What is the freezing of tasks? - -The freezing of tasks is a mechanism by which user space processes and some -kernel threads are controlled during hibernation or system-wide suspend (on some -architectures). - -II. How does it work? - -There are three per-task flags used for that, PF_NOFREEZE, PF_FROZEN -and PF_FREEZER_SKIP (the last one is auxiliary). The tasks that have -PF_NOFREEZE unset (all user space processes and some kernel threads) are -regarded as 'freezable' and treated in a special way before the system enters a -suspend state as well as before a hibernation image is created (in what follows -we only consider hibernation, but the description also applies to suspend). - -Namely, as the first step of the hibernation procedure the function -freeze_processes() (defined in kernel/power/process.c) is called. A system-wide -variable system_freezing_cnt (as opposed to a per-task flag) is used to indicate -whether the system is to undergo a freezing operation. And freeze_processes() -sets this variable. After this, it executes try_to_freeze_tasks() that sends a -fake signal to all user space processes, and wakes up all the kernel threads. -All freezable tasks must react to that by calling try_to_freeze(), which -results in a call to __refrigerator() (defined in kernel/freezer.c), which sets -the task's PF_FROZEN flag, changes its state to TASK_UNINTERRUPTIBLE and makes -it loop until PF_FROZEN is cleared for it. Then, we say that the task is -'frozen' and therefore the set of functions handling this mechanism is referred -to as 'the freezer' (these functions are defined in kernel/power/process.c, -kernel/freezer.c & include/linux/freezer.h). User space processes are generally -frozen before kernel threads. - -__refrigerator() must not be called directly. Instead, use the -try_to_freeze() function (defined in include/linux/freezer.h), that checks -if the task is to be frozen and makes the task enter __refrigerator(). - -For user space processes try_to_freeze() is called automatically from the -signal-handling code, but the freezable kernel threads need to call it -explicitly in suitable places or use the wait_event_freezable() or -wait_event_freezable_timeout() macros (defined in include/linux/freezer.h) -that combine interruptible sleep with checking if the task is to be frozen and -calling try_to_freeze(). The main loop of a freezable kernel thread may look -like the following one: - - set_freezable(); - do { - hub_events(); - wait_event_freezable(khubd_wait, - !list_empty(&hub_event_list) || - kthread_should_stop()); - } while (!kthread_should_stop() || !list_empty(&hub_event_list)); - -(from drivers/usb/core/hub.c::hub_thread()). - -If a freezable kernel thread fails to call try_to_freeze() after the freezer has -initiated a freezing operation, the freezing of tasks will fail and the entire -hibernation operation will be cancelled. For this reason, freezable kernel -threads must call try_to_freeze() somewhere or use one of the -wait_event_freezable() and wait_event_freezable_timeout() macros. - -After the system memory state has been restored from a hibernation image and -devices have been reinitialized, the function thaw_processes() is called in -order to clear the PF_FROZEN flag for each frozen task. Then, the tasks that -have been frozen leave __refrigerator() and continue running. - - -Rationale behind the functions dealing with freezing and thawing of tasks: -------------------------------------------------------------------------- - -freeze_processes(): - - freezes only userspace tasks - -freeze_kernel_threads(): - - freezes all tasks (including kernel threads) because we can't freeze - kernel threads without freezing userspace tasks - -thaw_kernel_threads(): - - thaws only kernel threads; this is particularly useful if we need to do - anything special in between thawing of kernel threads and thawing of - userspace tasks, or if we want to postpone the thawing of userspace tasks - -thaw_processes(): - - thaws all tasks (including kernel threads) because we can't thaw userspace - tasks without thawing kernel threads - - -III. Which kernel threads are freezable? - -Kernel threads are not freezable by default. However, a kernel thread may clear -PF_NOFREEZE for itself by calling set_freezable() (the resetting of PF_NOFREEZE -directly is not allowed). From this point it is regarded as freezable -and must call try_to_freeze() in a suitable place. - -IV. Why do we do that? - -Generally speaking, there is a couple of reasons to use the freezing of tasks: - -1. The principal reason is to prevent filesystems from being damaged after -hibernation. At the moment we have no simple means of checkpointing -filesystems, so if there are any modifications made to filesystem data and/or -metadata on disks, we cannot bring them back to the state from before the -modifications. At the same time each hibernation image contains some -filesystem-related information that must be consistent with the state of the -on-disk data and metadata after the system memory state has been restored from -the image (otherwise the filesystems will be damaged in a nasty way, usually -making them almost impossible to repair). We therefore freeze tasks that might -cause the on-disk filesystems' data and metadata to be modified after the -hibernation image has been created and before the system is finally powered off. -The majority of these are user space processes, but if any of the kernel threads -may cause something like this to happen, they have to be freezable. - -2. Next, to create the hibernation image we need to free a sufficient amount of -memory (approximately 50% of available RAM) and we need to do that before -devices are deactivated, because we generally need them for swapping out. Then, -after the memory for the image has been freed, we don't want tasks to allocate -additional memory and we prevent them from doing that by freezing them earlier. -[Of course, this also means that device drivers should not allocate substantial -amounts of memory from their .suspend() callbacks before hibernation, but this -is a separate issue.] - -3. The third reason is to prevent user space processes and some kernel threads -from interfering with the suspending and resuming of devices. A user space -process running on a second CPU while we are suspending devices may, for -example, be troublesome and without the freezing of tasks we would need some -safeguards against race conditions that might occur in such a case. - -Although Linus Torvalds doesn't like the freezing of tasks, he said this in one -of the discussions on LKML (http://lkml.org/lkml/2007/4/27/608): - -"RJW:> Why we freeze tasks at all or why we freeze kernel threads? - -Linus: In many ways, 'at all'. - -I _do_ realize the IO request queue issues, and that we cannot actually do -s2ram with some devices in the middle of a DMA. So we want to be able to -avoid *that*, there's no question about that. And I suspect that stopping -user threads and then waiting for a sync is practically one of the easier -ways to do so. - -So in practice, the 'at all' may become a 'why freeze kernel threads?' and -freezing user threads I don't find really objectionable." - -Still, there are kernel threads that may want to be freezable. For example, if -a kernel thread that belongs to a device driver accesses the device directly, it -in principle needs to know when the device is suspended, so that it doesn't try -to access it at that time. However, if the kernel thread is freezable, it will -be frozen before the driver's .suspend() callback is executed and it will be -thawed after the driver's .resume() callback has run, so it won't be accessing -the device while it's suspended. - -4. Another reason for freezing tasks is to prevent user space processes from -realizing that hibernation (or suspend) operation takes place. Ideally, user -space processes should not notice that such a system-wide operation has occurred -and should continue running without any problems after the restore (or resume -from suspend). Unfortunately, in the most general case this is quite difficult -to achieve without the freezing of tasks. Consider, for example, a process -that depends on all CPUs being online while it's running. Since we need to -disable nonboot CPUs during the hibernation, if this process is not frozen, it -may notice that the number of CPUs has changed and may start to work incorrectly -because of that. - -V. Are there any problems related to the freezing of tasks? - -Yes, there are. - -First of all, the freezing of kernel threads may be tricky if they depend one -on another. For example, if kernel thread A waits for a completion (in the -TASK_UNINTERRUPTIBLE state) that needs to be done by freezable kernel thread B -and B is frozen in the meantime, then A will be blocked until B is thawed, which -may be undesirable. That's why kernel threads are not freezable by default. - -Second, there are the following two problems related to the freezing of user -space processes: -1. Putting processes into an uninterruptible sleep distorts the load average. -2. Now that we have FUSE, plus the framework for doing device drivers in -userspace, it gets even more complicated because some userspace processes are -now doing the sorts of things that kernel threads do -(https://lists.linux-foundation.org/pipermail/linux-pm/2007-May/012309.html). - -The problem 1. seems to be fixable, although it hasn't been fixed so far. The -other one is more serious, but it seems that we can work around it by using -hibernation (and suspend) notifiers (in that case, though, we won't be able to -avoid the realization by the user space processes that the hibernation is taking -place). - -There are also problems that the freezing of tasks tends to expose, although -they are not directly related to it. For example, if request_firmware() is -called from a device driver's .resume() routine, it will timeout and eventually -fail, because the user land process that should respond to the request is frozen -at this point. So, seemingly, the failure is due to the freezing of tasks. -Suppose, however, that the firmware file is located on a filesystem accessible -only through another device that hasn't been resumed yet. In that case, -request_firmware() will fail regardless of whether or not the freezing of tasks -is used. Consequently, the problem is not really related to the freezing of -tasks, since it generally exists anyway. - -A driver must have all firmwares it may need in RAM before suspend() is called. -If keeping them is not practical, for example due to their size, they must be -requested early enough using the suspend notifier API described in -Documentation/driver-api/pm/notifiers.rst. - -VI. Are there any precautions to be taken to prevent freezing failures? - -Yes, there are. - -First of all, grabbing the 'system_transition_mutex' lock to mutually exclude a piece of code -from system-wide sleep such as suspend/hibernation is not encouraged. -If possible, that piece of code must instead hook onto the suspend/hibernation -notifiers to achieve mutual exclusion. Look at the CPU-Hotplug code -(kernel/cpu.c) for an example. - -However, if that is not feasible, and grabbing 'system_transition_mutex' is deemed necessary, -it is strongly discouraged to directly call mutex_[un]lock(&system_transition_mutex) since -that could lead to freezing failures, because if the suspend/hibernate code -successfully acquired the 'system_transition_mutex' lock, and hence that other entity failed -to acquire the lock, then that task would get blocked in TASK_UNINTERRUPTIBLE -state. As a consequence, the freezer would not be able to freeze that task, -leading to freezing failure. - -However, the [un]lock_system_sleep() APIs are safe to use in this scenario, -since they ask the freezer to skip freezing this task, since it is anyway -"frozen enough" as it is blocked on 'system_transition_mutex', which will be released -only after the entire suspend/hibernation sequence is complete. -So, to summarize, use [un]lock_system_sleep() instead of directly using -mutex_[un]lock(&system_transition_mutex). That would prevent freezing failures. - -V. Miscellaneous -/sys/power/pm_freeze_timeout controls how long it will cost at most to freeze -all user space processes or all freezable kernel threads, in unit of millisecond. -The default value is 20000, with range of unsigned integer. diff --git a/Documentation/power/index.rst b/Documentation/power/index.rst new file mode 100644 index 000000000000..20415f21e48a --- /dev/null +++ b/Documentation/power/index.rst @@ -0,0 +1,46 @@ +:orphan: + +================ +Power Management +================ + +.. toctree:: + :maxdepth: 1 + + apm-acpi + basic-pm-debugging + charger-manager + drivers-testing + energy-model + freezing-of-tasks + interface + opp + pci + pm_qos_interface + power_supply_class + runtime_pm + s2ram + suspend-and-cpuhotplug + suspend-and-interrupts + swsusp-and-swap-files + swsusp-dmcrypt + swsusp + video + tricks + + userland-swsusp + + powercap/powercap + + regulator/consumer + regulator/design + regulator/machine + regulator/overview + regulator/regulator + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/power/interface.rst b/Documentation/power/interface.rst new file mode 100644 index 000000000000..8d270ed27228 --- /dev/null +++ b/Documentation/power/interface.rst @@ -0,0 +1,79 @@ +=========================================== +Power Management Interface for System Sleep +=========================================== + +Copyright (c) 2016 Intel Corp., Rafael J. Wysocki + +The power management subsystem provides userspace with a unified sysfs interface +for system sleep regardless of the underlying system architecture or platform. +The interface is located in the /sys/power/ directory (assuming that sysfs is +mounted at /sys). + +/sys/power/state is the system sleep state control file. + +Reading from it returns a list of supported sleep states, encoded as: + +- 'freeze' (Suspend-to-Idle) +- 'standby' (Power-On Suspend) +- 'mem' (Suspend-to-RAM) +- 'disk' (Suspend-to-Disk) + +Suspend-to-Idle is always supported. Suspend-to-Disk is always supported +too as long the kernel has been configured to support hibernation at all +(ie. CONFIG_HIBERNATION is set in the kernel configuration file). Support +for Suspend-to-RAM and Power-On Suspend depends on the capabilities of the +platform. + +If one of the strings listed in /sys/power/state is written to it, the system +will attempt to transition into the corresponding sleep state. Refer to +Documentation/admin-guide/pm/sleep-states.rst for a description of each of +those states. + +/sys/power/disk controls the operating mode of hibernation (Suspend-to-Disk). +Specifically, it tells the kernel what to do after creating a hibernation image. + +Reading from it returns a list of supported options encoded as: + +- 'platform' (put the system into sleep using a platform-provided method) +- 'shutdown' (shut the system down) +- 'reboot' (reboot the system) +- 'suspend' (trigger a Suspend-to-RAM transition) +- 'test_resume' (resume-after-hibernation test mode) + +The currently selected option is printed in square brackets. + +The 'platform' option is only available if the platform provides a special +mechanism to put the system to sleep after creating a hibernation image (ACPI +does that, for example). The 'suspend' option is available if Suspend-to-RAM +is supported. Refer to Documentation/power/basic-pm-debugging.rst for the +description of the 'test_resume' option. + +To select an option, write the string representing it to /sys/power/disk. + +/sys/power/image_size controls the size of hibernation images. + +It can be written a string representing a non-negative integer that will be +used as a best-effort upper limit of the image size, in bytes. The hibernation +core will do its best to ensure that the image size will not exceed that number. +However, if that turns out to be impossible to achieve, a hibernation image will +still be created and its size will be as small as possible. In particular, +writing '0' to this file will enforce hibernation images to be as small as +possible. + +Reading from this file returns the current image size limit, which is set to +around 2/5 of available RAM by default. + +/sys/power/pm_trace controls the PM trace mechanism saving the last suspend +or resume event point in the RTC across reboots. + +It helps to debug hard lockups or reboots due to device driver failures that +occur during system suspend or resume (which is more common) more effectively. + +If /sys/power/pm_trace contains '1', the fingerprint of each suspend/resume +event point in turn will be stored in the RTC memory (overwriting the actual +RTC information), so it will survive a system crash if one occurs right after +storing it and it can be used later to identify the driver that caused the crash +to happen (see Documentation/power/s2ram.rst for more information). + +Initially it contains '0' which may be changed to '1' by writing a string +representing a nonzero integer into it. diff --git a/Documentation/power/interface.txt b/Documentation/power/interface.txt deleted file mode 100644 index 27df7f98668a..000000000000 --- a/Documentation/power/interface.txt +++ /dev/null @@ -1,77 +0,0 @@ -Power Management Interface for System Sleep - -Copyright (c) 2016 Intel Corp., Rafael J. Wysocki - -The power management subsystem provides userspace with a unified sysfs interface -for system sleep regardless of the underlying system architecture or platform. -The interface is located in the /sys/power/ directory (assuming that sysfs is -mounted at /sys). - -/sys/power/state is the system sleep state control file. - -Reading from it returns a list of supported sleep states, encoded as: - -'freeze' (Suspend-to-Idle) -'standby' (Power-On Suspend) -'mem' (Suspend-to-RAM) -'disk' (Suspend-to-Disk) - -Suspend-to-Idle is always supported. Suspend-to-Disk is always supported -too as long the kernel has been configured to support hibernation at all -(ie. CONFIG_HIBERNATION is set in the kernel configuration file). Support -for Suspend-to-RAM and Power-On Suspend depends on the capabilities of the -platform. - -If one of the strings listed in /sys/power/state is written to it, the system -will attempt to transition into the corresponding sleep state. Refer to -Documentation/admin-guide/pm/sleep-states.rst for a description of each of -those states. - -/sys/power/disk controls the operating mode of hibernation (Suspend-to-Disk). -Specifically, it tells the kernel what to do after creating a hibernation image. - -Reading from it returns a list of supported options encoded as: - -'platform' (put the system into sleep using a platform-provided method) -'shutdown' (shut the system down) -'reboot' (reboot the system) -'suspend' (trigger a Suspend-to-RAM transition) -'test_resume' (resume-after-hibernation test mode) - -The currently selected option is printed in square brackets. - -The 'platform' option is only available if the platform provides a special -mechanism to put the system to sleep after creating a hibernation image (ACPI -does that, for example). The 'suspend' option is available if Suspend-to-RAM -is supported. Refer to Documentation/power/basic-pm-debugging.txt for the -description of the 'test_resume' option. - -To select an option, write the string representing it to /sys/power/disk. - -/sys/power/image_size controls the size of hibernation images. - -It can be written a string representing a non-negative integer that will be -used as a best-effort upper limit of the image size, in bytes. The hibernation -core will do its best to ensure that the image size will not exceed that number. -However, if that turns out to be impossible to achieve, a hibernation image will -still be created and its size will be as small as possible. In particular, -writing '0' to this file will enforce hibernation images to be as small as -possible. - -Reading from this file returns the current image size limit, which is set to -around 2/5 of available RAM by default. - -/sys/power/pm_trace controls the PM trace mechanism saving the last suspend -or resume event point in the RTC across reboots. - -It helps to debug hard lockups or reboots due to device driver failures that -occur during system suspend or resume (which is more common) more effectively. - -If /sys/power/pm_trace contains '1', the fingerprint of each suspend/resume -event point in turn will be stored in the RTC memory (overwriting the actual -RTC information), so it will survive a system crash if one occurs right after -storing it and it can be used later to identify the driver that caused the crash -to happen (see Documentation/power/s2ram.txt for more information). - -Initially it contains '0' which may be changed to '1' by writing a string -representing a nonzero integer into it. diff --git a/Documentation/power/opp.rst b/Documentation/power/opp.rst new file mode 100644 index 000000000000..b3cf1def9dee --- /dev/null +++ b/Documentation/power/opp.rst @@ -0,0 +1,379 @@ +========================================== +Operating Performance Points (OPP) Library +========================================== + +(C) 2009-2010 Nishanth Menon , Texas Instruments Incorporated + +.. Contents + + 1. Introduction + 2. Initial OPP List Registration + 3. OPP Search Functions + 4. OPP Availability Control Functions + 5. OPP Data Retrieval Functions + 6. Data Structures + +1. Introduction +=============== + +1.1 What is an Operating Performance Point (OPP)? +------------------------------------------------- + +Complex SoCs of today consists of a multiple sub-modules working in conjunction. +In an operational system executing varied use cases, not all modules in the SoC +need to function at their highest performing frequency all the time. To +facilitate this, sub-modules in a SoC are grouped into domains, allowing some +domains to run at lower voltage and frequency while other domains run at +voltage/frequency pairs that are higher. + +The set of discrete tuples consisting of frequency and voltage pairs that +the device will support per domain are called Operating Performance Points or +OPPs. + +As an example: + +Let us consider an MPU device which supports the following: +{300MHz at minimum voltage of 1V}, {800MHz at minimum voltage of 1.2V}, +{1GHz at minimum voltage of 1.3V} + +We can represent these as three OPPs as the following {Hz, uV} tuples: + +- {300000000, 1000000} +- {800000000, 1200000} +- {1000000000, 1300000} + +1.2 Operating Performance Points Library +---------------------------------------- + +OPP library provides a set of helper functions to organize and query the OPP +information. The library is located in drivers/base/power/opp.c and the header +is located in include/linux/pm_opp.h. OPP library can be enabled by enabling +CONFIG_PM_OPP from power management menuconfig menu. OPP library depends on +CONFIG_PM as certain SoCs such as Texas Instrument's OMAP framework allows to +optionally boot at a certain OPP without needing cpufreq. + +Typical usage of the OPP library is as follows:: + + (users) -> registers a set of default OPPs -> (library) + SoC framework -> modifies on required cases certain OPPs -> OPP layer + -> queries to search/retrieve information -> + +OPP layer expects each domain to be represented by a unique device pointer. SoC +framework registers a set of initial OPPs per device with the OPP layer. This +list is expected to be an optimally small number typically around 5 per device. +This initial list contains a set of OPPs that the framework expects to be safely +enabled by default in the system. + +Note on OPP Availability +^^^^^^^^^^^^^^^^^^^^^^^^ + +As the system proceeds to operate, SoC framework may choose to make certain +OPPs available or not available on each device based on various external +factors. Example usage: Thermal management or other exceptional situations where +SoC framework might choose to disable a higher frequency OPP to safely continue +operations until that OPP could be re-enabled if possible. + +OPP library facilitates this concept in it's implementation. The following +operational functions operate only on available opps: +opp_find_freq_{ceil, floor}, dev_pm_opp_get_voltage, dev_pm_opp_get_freq, dev_pm_opp_get_opp_count + +dev_pm_opp_find_freq_exact is meant to be used to find the opp pointer which can then +be used for dev_pm_opp_enable/disable functions to make an opp available as required. + +WARNING: Users of OPP library should refresh their availability count using +get_opp_count if dev_pm_opp_enable/disable functions are invoked for a device, the +exact mechanism to trigger these or the notification mechanism to other +dependent subsystems such as cpufreq are left to the discretion of the SoC +specific framework which uses the OPP library. Similar care needs to be taken +care to refresh the cpufreq table in cases of these operations. + +2. Initial OPP List Registration +================================ +The SoC implementation calls dev_pm_opp_add function iteratively to add OPPs per +device. It is expected that the SoC framework will register the OPP entries +optimally- typical numbers range to be less than 5. The list generated by +registering the OPPs is maintained by OPP library throughout the device +operation. The SoC framework can subsequently control the availability of the +OPPs dynamically using the dev_pm_opp_enable / disable functions. + +dev_pm_opp_add + Add a new OPP for a specific domain represented by the device pointer. + The OPP is defined using the frequency and voltage. Once added, the OPP + is assumed to be available and control of it's availability can be done + with the dev_pm_opp_enable/disable functions. OPP library internally stores + and manages this information in the opp struct. This function may be + used by SoC framework to define a optimal list as per the demands of + SoC usage environment. + + WARNING: + Do not use this function in interrupt context. + + Example:: + + soc_pm_init() + { + /* Do things */ + r = dev_pm_opp_add(mpu_dev, 1000000, 900000); + if (!r) { + pr_err("%s: unable to register mpu opp(%d)\n", r); + goto no_cpufreq; + } + /* Do cpufreq things */ + no_cpufreq: + /* Do remaining things */ + } + +3. OPP Search Functions +======================= +High level framework such as cpufreq operates on frequencies. To map the +frequency back to the corresponding OPP, OPP library provides handy functions +to search the OPP list that OPP library internally manages. These search +functions return the matching pointer representing the opp if a match is +found, else returns error. These errors are expected to be handled by standard +error checks such as IS_ERR() and appropriate actions taken by the caller. + +Callers of these functions shall call dev_pm_opp_put() after they have used the +OPP. Otherwise the memory for the OPP will never get freed and result in +memleak. + +dev_pm_opp_find_freq_exact + Search for an OPP based on an *exact* frequency and + availability. This function is especially useful to enable an OPP which + is not available by default. + Example: In a case when SoC framework detects a situation where a + higher frequency could be made available, it can use this function to + find the OPP prior to call the dev_pm_opp_enable to actually make + it available:: + + opp = dev_pm_opp_find_freq_exact(dev, 1000000000, false); + dev_pm_opp_put(opp); + /* dont operate on the pointer.. just do a sanity check.. */ + if (IS_ERR(opp)) { + pr_err("frequency not disabled!\n"); + /* trigger appropriate actions.. */ + } else { + dev_pm_opp_enable(dev,1000000000); + } + + NOTE: + This is the only search function that operates on OPPs which are + not available. + +dev_pm_opp_find_freq_floor + Search for an available OPP which is *at most* the + provided frequency. This function is useful while searching for a lesser + match OR operating on OPP information in the order of decreasing + frequency. + Example: To find the highest opp for a device:: + + freq = ULONG_MAX; + opp = dev_pm_opp_find_freq_floor(dev, &freq); + dev_pm_opp_put(opp); + +dev_pm_opp_find_freq_ceil + Search for an available OPP which is *at least* the + provided frequency. This function is useful while searching for a + higher match OR operating on OPP information in the order of increasing + frequency. + Example 1: To find the lowest opp for a device:: + + freq = 0; + opp = dev_pm_opp_find_freq_ceil(dev, &freq); + dev_pm_opp_put(opp); + + Example 2: A simplified implementation of a SoC cpufreq_driver->target:: + + soc_cpufreq_target(..) + { + /* Do stuff like policy checks etc. */ + /* Find the best frequency match for the req */ + opp = dev_pm_opp_find_freq_ceil(dev, &freq); + dev_pm_opp_put(opp); + if (!IS_ERR(opp)) + soc_switch_to_freq_voltage(freq); + else + /* do something when we can't satisfy the req */ + /* do other stuff */ + } + +4. OPP Availability Control Functions +===================================== +A default OPP list registered with the OPP library may not cater to all possible +situation. The OPP library provides a set of functions to modify the +availability of a OPP within the OPP list. This allows SoC frameworks to have +fine grained dynamic control of which sets of OPPs are operationally available. +These functions are intended to *temporarily* remove an OPP in conditions such +as thermal considerations (e.g. don't use OPPx until the temperature drops). + +WARNING: + Do not use these functions in interrupt context. + +dev_pm_opp_enable + Make a OPP available for operation. + Example: Lets say that 1GHz OPP is to be made available only if the + SoC temperature is lower than a certain threshold. The SoC framework + implementation might choose to do something as follows:: + + if (cur_temp < temp_low_thresh) { + /* Enable 1GHz if it was disabled */ + opp = dev_pm_opp_find_freq_exact(dev, 1000000000, false); + dev_pm_opp_put(opp); + /* just error check */ + if (!IS_ERR(opp)) + ret = dev_pm_opp_enable(dev, 1000000000); + else + goto try_something_else; + } + +dev_pm_opp_disable + Make an OPP to be not available for operation + Example: Lets say that 1GHz OPP is to be disabled if the temperature + exceeds a threshold value. The SoC framework implementation might + choose to do something as follows:: + + if (cur_temp > temp_high_thresh) { + /* Disable 1GHz if it was enabled */ + opp = dev_pm_opp_find_freq_exact(dev, 1000000000, true); + dev_pm_opp_put(opp); + /* just error check */ + if (!IS_ERR(opp)) + ret = dev_pm_opp_disable(dev, 1000000000); + else + goto try_something_else; + } + +5. OPP Data Retrieval Functions +=============================== +Since OPP library abstracts away the OPP information, a set of functions to pull +information from the OPP structure is necessary. Once an OPP pointer is +retrieved using the search functions, the following functions can be used by SoC +framework to retrieve the information represented inside the OPP layer. + +dev_pm_opp_get_voltage + Retrieve the voltage represented by the opp pointer. + Example: At a cpufreq transition to a different frequency, SoC + framework requires to set the voltage represented by the OPP using + the regulator framework to the Power Management chip providing the + voltage:: + + soc_switch_to_freq_voltage(freq) + { + /* do things */ + opp = dev_pm_opp_find_freq_ceil(dev, &freq); + v = dev_pm_opp_get_voltage(opp); + dev_pm_opp_put(opp); + if (v) + regulator_set_voltage(.., v); + /* do other things */ + } + +dev_pm_opp_get_freq + Retrieve the freq represented by the opp pointer. + Example: Lets say the SoC framework uses a couple of helper functions + we could pass opp pointers instead of doing additional parameters to + handle quiet a bit of data parameters:: + + soc_cpufreq_target(..) + { + /* do things.. */ + max_freq = ULONG_MAX; + max_opp = dev_pm_opp_find_freq_floor(dev,&max_freq); + requested_opp = dev_pm_opp_find_freq_ceil(dev,&freq); + if (!IS_ERR(max_opp) && !IS_ERR(requested_opp)) + r = soc_test_validity(max_opp, requested_opp); + dev_pm_opp_put(max_opp); + dev_pm_opp_put(requested_opp); + /* do other things */ + } + soc_test_validity(..) + { + if(dev_pm_opp_get_voltage(max_opp) < dev_pm_opp_get_voltage(requested_opp)) + return -EINVAL; + if(dev_pm_opp_get_freq(max_opp) < dev_pm_opp_get_freq(requested_opp)) + return -EINVAL; + /* do things.. */ + } + +dev_pm_opp_get_opp_count + Retrieve the number of available opps for a device + Example: Lets say a co-processor in the SoC needs to know the available + frequencies in a table, the main processor can notify as following:: + + soc_notify_coproc_available_frequencies() + { + /* Do things */ + num_available = dev_pm_opp_get_opp_count(dev); + speeds = kzalloc(sizeof(u32) * num_available, GFP_KERNEL); + /* populate the table in increasing order */ + freq = 0; + while (!IS_ERR(opp = dev_pm_opp_find_freq_ceil(dev, &freq))) { + speeds[i] = freq; + freq++; + i++; + dev_pm_opp_put(opp); + } + + soc_notify_coproc(AVAILABLE_FREQs, speeds, num_available); + /* Do other things */ + } + +6. Data Structures +================== +Typically an SoC contains multiple voltage domains which are variable. Each +domain is represented by a device pointer. The relationship to OPP can be +represented as follows:: + + SoC + |- device 1 + | |- opp 1 (availability, freq, voltage) + | |- opp 2 .. + ... ... + | `- opp n .. + |- device 2 + ... + `- device m + +OPP library maintains a internal list that the SoC framework populates and +accessed by various functions as described above. However, the structures +representing the actual OPPs and domains are internal to the OPP library itself +to allow for suitable abstraction reusable across systems. + +struct dev_pm_opp + The internal data structure of OPP library which is used to + represent an OPP. In addition to the freq, voltage, availability + information, it also contains internal book keeping information required + for the OPP library to operate on. Pointer to this structure is + provided back to the users such as SoC framework to be used as a + identifier for OPP in the interactions with OPP layer. + + WARNING: + The struct dev_pm_opp pointer should not be parsed or modified by the + users. The defaults of for an instance is populated by + dev_pm_opp_add, but the availability of the OPP can be modified + by dev_pm_opp_enable/disable functions. + +struct device + This is used to identify a domain to the OPP layer. The + nature of the device and it's implementation is left to the user of + OPP library such as the SoC framework. + +Overall, in a simplistic view, the data structure operations is represented as +following:: + + Initialization / modification: + +-----+ /- dev_pm_opp_enable + dev_pm_opp_add --> | opp | <------- + | +-----+ \- dev_pm_opp_disable + \-------> domain_info(device) + + Search functions: + /-- dev_pm_opp_find_freq_ceil ---\ +-----+ + domain_info<---- dev_pm_opp_find_freq_exact -----> | opp | + \-- dev_pm_opp_find_freq_floor ---/ +-----+ + + Retrieval functions: + +-----+ /- dev_pm_opp_get_voltage + | opp | <--- + +-----+ \- dev_pm_opp_get_freq + + domain_info <- dev_pm_opp_get_opp_count diff --git a/Documentation/power/opp.txt b/Documentation/power/opp.txt deleted file mode 100644 index 0c007e250cd1..000000000000 --- a/Documentation/power/opp.txt +++ /dev/null @@ -1,342 +0,0 @@ -Operating Performance Points (OPP) Library -========================================== - -(C) 2009-2010 Nishanth Menon , Texas Instruments Incorporated - -Contents --------- -1. Introduction -2. Initial OPP List Registration -3. OPP Search Functions -4. OPP Availability Control Functions -5. OPP Data Retrieval Functions -6. Data Structures - -1. Introduction -=============== -1.1 What is an Operating Performance Point (OPP)? - -Complex SoCs of today consists of a multiple sub-modules working in conjunction. -In an operational system executing varied use cases, not all modules in the SoC -need to function at their highest performing frequency all the time. To -facilitate this, sub-modules in a SoC are grouped into domains, allowing some -domains to run at lower voltage and frequency while other domains run at -voltage/frequency pairs that are higher. - -The set of discrete tuples consisting of frequency and voltage pairs that -the device will support per domain are called Operating Performance Points or -OPPs. - -As an example: -Let us consider an MPU device which supports the following: -{300MHz at minimum voltage of 1V}, {800MHz at minimum voltage of 1.2V}, -{1GHz at minimum voltage of 1.3V} - -We can represent these as three OPPs as the following {Hz, uV} tuples: -{300000000, 1000000} -{800000000, 1200000} -{1000000000, 1300000} - -1.2 Operating Performance Points Library - -OPP library provides a set of helper functions to organize and query the OPP -information. The library is located in drivers/base/power/opp.c and the header -is located in include/linux/pm_opp.h. OPP library can be enabled by enabling -CONFIG_PM_OPP from power management menuconfig menu. OPP library depends on -CONFIG_PM as certain SoCs such as Texas Instrument's OMAP framework allows to -optionally boot at a certain OPP without needing cpufreq. - -Typical usage of the OPP library is as follows: -(users) -> registers a set of default OPPs -> (library) -SoC framework -> modifies on required cases certain OPPs -> OPP layer - -> queries to search/retrieve information -> - -OPP layer expects each domain to be represented by a unique device pointer. SoC -framework registers a set of initial OPPs per device with the OPP layer. This -list is expected to be an optimally small number typically around 5 per device. -This initial list contains a set of OPPs that the framework expects to be safely -enabled by default in the system. - -Note on OPP Availability: ------------------------- -As the system proceeds to operate, SoC framework may choose to make certain -OPPs available or not available on each device based on various external -factors. Example usage: Thermal management or other exceptional situations where -SoC framework might choose to disable a higher frequency OPP to safely continue -operations until that OPP could be re-enabled if possible. - -OPP library facilitates this concept in it's implementation. The following -operational functions operate only on available opps: -opp_find_freq_{ceil, floor}, dev_pm_opp_get_voltage, dev_pm_opp_get_freq, dev_pm_opp_get_opp_count - -dev_pm_opp_find_freq_exact is meant to be used to find the opp pointer which can then -be used for dev_pm_opp_enable/disable functions to make an opp available as required. - -WARNING: Users of OPP library should refresh their availability count using -get_opp_count if dev_pm_opp_enable/disable functions are invoked for a device, the -exact mechanism to trigger these or the notification mechanism to other -dependent subsystems such as cpufreq are left to the discretion of the SoC -specific framework which uses the OPP library. Similar care needs to be taken -care to refresh the cpufreq table in cases of these operations. - -2. Initial OPP List Registration -================================ -The SoC implementation calls dev_pm_opp_add function iteratively to add OPPs per -device. It is expected that the SoC framework will register the OPP entries -optimally- typical numbers range to be less than 5. The list generated by -registering the OPPs is maintained by OPP library throughout the device -operation. The SoC framework can subsequently control the availability of the -OPPs dynamically using the dev_pm_opp_enable / disable functions. - -dev_pm_opp_add - Add a new OPP for a specific domain represented by the device pointer. - The OPP is defined using the frequency and voltage. Once added, the OPP - is assumed to be available and control of it's availability can be done - with the dev_pm_opp_enable/disable functions. OPP library internally stores - and manages this information in the opp struct. This function may be - used by SoC framework to define a optimal list as per the demands of - SoC usage environment. - - WARNING: Do not use this function in interrupt context. - - Example: - soc_pm_init() - { - /* Do things */ - r = dev_pm_opp_add(mpu_dev, 1000000, 900000); - if (!r) { - pr_err("%s: unable to register mpu opp(%d)\n", r); - goto no_cpufreq; - } - /* Do cpufreq things */ - no_cpufreq: - /* Do remaining things */ - } - -3. OPP Search Functions -======================= -High level framework such as cpufreq operates on frequencies. To map the -frequency back to the corresponding OPP, OPP library provides handy functions -to search the OPP list that OPP library internally manages. These search -functions return the matching pointer representing the opp if a match is -found, else returns error. These errors are expected to be handled by standard -error checks such as IS_ERR() and appropriate actions taken by the caller. - -Callers of these functions shall call dev_pm_opp_put() after they have used the -OPP. Otherwise the memory for the OPP will never get freed and result in -memleak. - -dev_pm_opp_find_freq_exact - Search for an OPP based on an *exact* frequency and - availability. This function is especially useful to enable an OPP which - is not available by default. - Example: In a case when SoC framework detects a situation where a - higher frequency could be made available, it can use this function to - find the OPP prior to call the dev_pm_opp_enable to actually make it available. - opp = dev_pm_opp_find_freq_exact(dev, 1000000000, false); - dev_pm_opp_put(opp); - /* dont operate on the pointer.. just do a sanity check.. */ - if (IS_ERR(opp)) { - pr_err("frequency not disabled!\n"); - /* trigger appropriate actions.. */ - } else { - dev_pm_opp_enable(dev,1000000000); - } - - NOTE: This is the only search function that operates on OPPs which are - not available. - -dev_pm_opp_find_freq_floor - Search for an available OPP which is *at most* the - provided frequency. This function is useful while searching for a lesser - match OR operating on OPP information in the order of decreasing - frequency. - Example: To find the highest opp for a device: - freq = ULONG_MAX; - opp = dev_pm_opp_find_freq_floor(dev, &freq); - dev_pm_opp_put(opp); - -dev_pm_opp_find_freq_ceil - Search for an available OPP which is *at least* the - provided frequency. This function is useful while searching for a - higher match OR operating on OPP information in the order of increasing - frequency. - Example 1: To find the lowest opp for a device: - freq = 0; - opp = dev_pm_opp_find_freq_ceil(dev, &freq); - dev_pm_opp_put(opp); - Example 2: A simplified implementation of a SoC cpufreq_driver->target: - soc_cpufreq_target(..) - { - /* Do stuff like policy checks etc. */ - /* Find the best frequency match for the req */ - opp = dev_pm_opp_find_freq_ceil(dev, &freq); - dev_pm_opp_put(opp); - if (!IS_ERR(opp)) - soc_switch_to_freq_voltage(freq); - else - /* do something when we can't satisfy the req */ - /* do other stuff */ - } - -4. OPP Availability Control Functions -===================================== -A default OPP list registered with the OPP library may not cater to all possible -situation. The OPP library provides a set of functions to modify the -availability of a OPP within the OPP list. This allows SoC frameworks to have -fine grained dynamic control of which sets of OPPs are operationally available. -These functions are intended to *temporarily* remove an OPP in conditions such -as thermal considerations (e.g. don't use OPPx until the temperature drops). - -WARNING: Do not use these functions in interrupt context. - -dev_pm_opp_enable - Make a OPP available for operation. - Example: Lets say that 1GHz OPP is to be made available only if the - SoC temperature is lower than a certain threshold. The SoC framework - implementation might choose to do something as follows: - if (cur_temp < temp_low_thresh) { - /* Enable 1GHz if it was disabled */ - opp = dev_pm_opp_find_freq_exact(dev, 1000000000, false); - dev_pm_opp_put(opp); - /* just error check */ - if (!IS_ERR(opp)) - ret = dev_pm_opp_enable(dev, 1000000000); - else - goto try_something_else; - } - -dev_pm_opp_disable - Make an OPP to be not available for operation - Example: Lets say that 1GHz OPP is to be disabled if the temperature - exceeds a threshold value. The SoC framework implementation might - choose to do something as follows: - if (cur_temp > temp_high_thresh) { - /* Disable 1GHz if it was enabled */ - opp = dev_pm_opp_find_freq_exact(dev, 1000000000, true); - dev_pm_opp_put(opp); - /* just error check */ - if (!IS_ERR(opp)) - ret = dev_pm_opp_disable(dev, 1000000000); - else - goto try_something_else; - } - -5. OPP Data Retrieval Functions -=============================== -Since OPP library abstracts away the OPP information, a set of functions to pull -information from the OPP structure is necessary. Once an OPP pointer is -retrieved using the search functions, the following functions can be used by SoC -framework to retrieve the information represented inside the OPP layer. - -dev_pm_opp_get_voltage - Retrieve the voltage represented by the opp pointer. - Example: At a cpufreq transition to a different frequency, SoC - framework requires to set the voltage represented by the OPP using - the regulator framework to the Power Management chip providing the - voltage. - soc_switch_to_freq_voltage(freq) - { - /* do things */ - opp = dev_pm_opp_find_freq_ceil(dev, &freq); - v = dev_pm_opp_get_voltage(opp); - dev_pm_opp_put(opp); - if (v) - regulator_set_voltage(.., v); - /* do other things */ - } - -dev_pm_opp_get_freq - Retrieve the freq represented by the opp pointer. - Example: Lets say the SoC framework uses a couple of helper functions - we could pass opp pointers instead of doing additional parameters to - handle quiet a bit of data parameters. - soc_cpufreq_target(..) - { - /* do things.. */ - max_freq = ULONG_MAX; - max_opp = dev_pm_opp_find_freq_floor(dev,&max_freq); - requested_opp = dev_pm_opp_find_freq_ceil(dev,&freq); - if (!IS_ERR(max_opp) && !IS_ERR(requested_opp)) - r = soc_test_validity(max_opp, requested_opp); - dev_pm_opp_put(max_opp); - dev_pm_opp_put(requested_opp); - /* do other things */ - } - soc_test_validity(..) - { - if(dev_pm_opp_get_voltage(max_opp) < dev_pm_opp_get_voltage(requested_opp)) - return -EINVAL; - if(dev_pm_opp_get_freq(max_opp) < dev_pm_opp_get_freq(requested_opp)) - return -EINVAL; - /* do things.. */ - } - -dev_pm_opp_get_opp_count - Retrieve the number of available opps for a device - Example: Lets say a co-processor in the SoC needs to know the available - frequencies in a table, the main processor can notify as following: - soc_notify_coproc_available_frequencies() - { - /* Do things */ - num_available = dev_pm_opp_get_opp_count(dev); - speeds = kzalloc(sizeof(u32) * num_available, GFP_KERNEL); - /* populate the table in increasing order */ - freq = 0; - while (!IS_ERR(opp = dev_pm_opp_find_freq_ceil(dev, &freq))) { - speeds[i] = freq; - freq++; - i++; - dev_pm_opp_put(opp); - } - - soc_notify_coproc(AVAILABLE_FREQs, speeds, num_available); - /* Do other things */ - } - -6. Data Structures -================== -Typically an SoC contains multiple voltage domains which are variable. Each -domain is represented by a device pointer. The relationship to OPP can be -represented as follows: -SoC - |- device 1 - | |- opp 1 (availability, freq, voltage) - | |- opp 2 .. - ... ... - | `- opp n .. - |- device 2 - ... - `- device m - -OPP library maintains a internal list that the SoC framework populates and -accessed by various functions as described above. However, the structures -representing the actual OPPs and domains are internal to the OPP library itself -to allow for suitable abstraction reusable across systems. - -struct dev_pm_opp - The internal data structure of OPP library which is used to - represent an OPP. In addition to the freq, voltage, availability - information, it also contains internal book keeping information required - for the OPP library to operate on. Pointer to this structure is - provided back to the users such as SoC framework to be used as a - identifier for OPP in the interactions with OPP layer. - - WARNING: The struct dev_pm_opp pointer should not be parsed or modified by the - users. The defaults of for an instance is populated by dev_pm_opp_add, but the - availability of the OPP can be modified by dev_pm_opp_enable/disable functions. - -struct device - This is used to identify a domain to the OPP layer. The - nature of the device and it's implementation is left to the user of - OPP library such as the SoC framework. - -Overall, in a simplistic view, the data structure operations is represented as -following: - -Initialization / modification: - +-----+ /- dev_pm_opp_enable -dev_pm_opp_add --> | opp | <------- - | +-----+ \- dev_pm_opp_disable - \-------> domain_info(device) - -Search functions: - /-- dev_pm_opp_find_freq_ceil ---\ +-----+ -domain_info<---- dev_pm_opp_find_freq_exact -----> | opp | - \-- dev_pm_opp_find_freq_floor ---/ +-----+ - -Retrieval functions: -+-----+ /- dev_pm_opp_get_voltage -| opp | <--- -+-----+ \- dev_pm_opp_get_freq - -domain_info <- dev_pm_opp_get_opp_count diff --git a/Documentation/power/pci.rst b/Documentation/power/pci.rst new file mode 100644 index 000000000000..0e2ef7429304 --- /dev/null +++ b/Documentation/power/pci.rst @@ -0,0 +1,1135 @@ +==================== +PCI Power Management +==================== + +Copyright (c) 2010 Rafael J. Wysocki , Novell Inc. + +An overview of concepts and the Linux kernel's interfaces related to PCI power +management. Based on previous work by Patrick Mochel +(and others). + +This document only covers the aspects of power management specific to PCI +devices. For general description of the kernel's interfaces related to device +power management refer to Documentation/driver-api/pm/devices.rst and +Documentation/power/runtime_pm.rst. + +.. contents: + + 1. Hardware and Platform Support for PCI Power Management + 2. PCI Subsystem and Device Power Management + 3. PCI Device Drivers and Power Management + 4. Resources + + +1. Hardware and Platform Support for PCI Power Management +========================================================= + +1.1. Native and Platform-Based Power Management +----------------------------------------------- + +In general, power management is a feature allowing one to save energy by putting +devices into states in which they draw less power (low-power states) at the +price of reduced functionality or performance. + +Usually, a device is put into a low-power state when it is underutilized or +completely inactive. However, when it is necessary to use the device once +again, it has to be put back into the "fully functional" state (full-power +state). This may happen when there are some data for the device to handle or +as a result of an external event requiring the device to be active, which may +be signaled by the device itself. + +PCI devices may be put into low-power states in two ways, by using the device +capabilities introduced by the PCI Bus Power Management Interface Specification, +or with the help of platform firmware, such as an ACPI BIOS. In the first +approach, that is referred to as the native PCI power management (native PCI PM) +in what follows, the device power state is changed as a result of writing a +specific value into one of its standard configuration registers. The second +approach requires the platform firmware to provide special methods that may be +used by the kernel to change the device's power state. + +Devices supporting the native PCI PM usually can generate wakeup signals called +Power Management Events (PMEs) to let the kernel know about external events +requiring the device to be active. After receiving a PME the kernel is supposed +to put the device that sent it into the full-power state. However, the PCI Bus +Power Management Interface Specification doesn't define any standard method of +delivering the PME from the device to the CPU and the operating system kernel. +It is assumed that the platform firmware will perform this task and therefore, +even though a PCI device is set up to generate PMEs, it also may be necessary to +prepare the platform firmware for notifying the CPU of the PMEs coming from the +device (e.g. by generating interrupts). + +In turn, if the methods provided by the platform firmware are used for changing +the power state of a device, usually the platform also provides a method for +preparing the device to generate wakeup signals. In that case, however, it +often also is necessary to prepare the device for generating PMEs using the +native PCI PM mechanism, because the method provided by the platform depends on +that. + +Thus in many situations both the native and the platform-based power management +mechanisms have to be used simultaneously to obtain the desired result. + +1.2. Native PCI Power Management +-------------------------------- + +The PCI Bus Power Management Interface Specification (PCI PM Spec) was +introduced between the PCI 2.1 and PCI 2.2 Specifications. It defined a +standard interface for performing various operations related to power +management. + +The implementation of the PCI PM Spec is optional for conventional PCI devices, +but it is mandatory for PCI Express devices. If a device supports the PCI PM +Spec, it has an 8 byte power management capability field in its PCI +configuration space. This field is used to describe and control the standard +features related to the native PCI power management. + +The PCI PM Spec defines 4 operating states for devices (D0-D3) and for buses +(B0-B3). The higher the number, the less power is drawn by the device or bus +in that state. However, the higher the number, the longer the latency for +the device or bus to return to the full-power state (D0 or B0, respectively). + +There are two variants of the D3 state defined by the specification. The first +one is D3hot, referred to as the software accessible D3, because devices can be +programmed to go into it. The second one, D3cold, is the state that PCI devices +are in when the supply voltage (Vcc) is removed from them. It is not possible +to program a PCI device to go into D3cold, although there may be a programmable +interface for putting the bus the device is on into a state in which Vcc is +removed from all devices on the bus. + +PCI bus power management, however, is not supported by the Linux kernel at the +time of this writing and therefore it is not covered by this document. + +Note that every PCI device can be in the full-power state (D0) or in D3cold, +regardless of whether or not it implements the PCI PM Spec. In addition to +that, if the PCI PM Spec is implemented by the device, it must support D3hot +as well as D0. The support for the D1 and D2 power states is optional. + +PCI devices supporting the PCI PM Spec can be programmed to go to any of the +supported low-power states (except for D3cold). While in D1-D3hot the +standard configuration registers of the device must be accessible to software +(i.e. the device is required to respond to PCI configuration accesses), although +its I/O and memory spaces are then disabled. This allows the device to be +programmatically put into D0. Thus the kernel can switch the device back and +forth between D0 and the supported low-power states (except for D3cold) and the +possible power state transitions the device can undergo are the following: + ++----------------------------+ +| Current State | New State | ++----------------------------+ +| D0 | D1, D2, D3 | ++----------------------------+ +| D1 | D2, D3 | ++----------------------------+ +| D2 | D3 | ++----------------------------+ +| D1, D2, D3 | D0 | ++----------------------------+ + +The transition from D3cold to D0 occurs when the supply voltage is provided to +the device (i.e. power is restored). In that case the device returns to D0 with +a full power-on reset sequence and the power-on defaults are restored to the +device by hardware just as at initial power up. + +PCI devices supporting the PCI PM Spec can be programmed to generate PMEs +while in a low-power state (D1-D3), but they are not required to be capable +of generating PMEs from all supported low-power states. In particular, the +capability of generating PMEs from D3cold is optional and depends on the +presence of additional voltage (3.3Vaux) allowing the device to remain +sufficiently active to generate a wakeup signal. + +1.3. ACPI Device Power Management +--------------------------------- + +The platform firmware support for the power management of PCI devices is +system-specific. However, if the system in question is compliant with the +Advanced Configuration and Power Interface (ACPI) Specification, like the +majority of x86-based systems, it is supposed to implement device power +management interfaces defined by the ACPI standard. + +For this purpose the ACPI BIOS provides special functions called "control +methods" that may be executed by the kernel to perform specific tasks, such as +putting a device into a low-power state. These control methods are encoded +using special byte-code language called the ACPI Machine Language (AML) and +stored in the machine's BIOS. The kernel loads them from the BIOS and executes +them as needed using an AML interpreter that translates the AML byte code into +computations and memory or I/O space accesses. This way, in theory, a BIOS +writer can provide the kernel with a means to perform actions depending +on the system design in a system-specific fashion. + +ACPI control methods may be divided into global control methods, that are not +associated with any particular devices, and device control methods, that have +to be defined separately for each device supposed to be handled with the help of +the platform. This means, in particular, that ACPI device control methods can +only be used to handle devices that the BIOS writer knew about in advance. The +ACPI methods used for device power management fall into that category. + +The ACPI specification assumes that devices can be in one of four power states +labeled as D0, D1, D2, and D3 that roughly correspond to the native PCI PM +D0-D3 states (although the difference between D3hot and D3cold is not taken +into account by ACPI). Moreover, for each power state of a device there is a +set of power resources that have to be enabled for the device to be put into +that state. These power resources are controlled (i.e. enabled or disabled) +with the help of their own control methods, _ON and _OFF, that have to be +defined individually for each of them. + +To put a device into the ACPI power state Dx (where x is a number between 0 and +3 inclusive) the kernel is supposed to (1) enable the power resources required +by the device in this state using their _ON control methods and (2) execute the +_PSx control method defined for the device. In addition to that, if the device +is going to be put into a low-power state (D1-D3) and is supposed to generate +wakeup signals from that state, the _DSW (or _PSW, replaced with _DSW by ACPI +3.0) control method defined for it has to be executed before _PSx. Power +resources that are not required by the device in the target power state and are +not required any more by any other device should be disabled (by executing their +_OFF control methods). If the current power state of the device is D3, it can +only be put into D0 this way. + +However, quite often the power states of devices are changed during a +system-wide transition into a sleep state or back into the working state. ACPI +defines four system sleep states, S1, S2, S3, and S4, and denotes the system +working state as S0. In general, the target system sleep (or working) state +determines the highest power (lowest number) state the device can be put +into and the kernel is supposed to obtain this information by executing the +device's _SxD control method (where x is a number between 0 and 4 inclusive). +If the device is required to wake up the system from the target sleep state, the +lowest power (highest number) state it can be put into is also determined by the +target state of the system. The kernel is then supposed to use the device's +_SxW control method to obtain the number of that state. It also is supposed to +use the device's _PRW control method to learn which power resources need to be +enabled for the device to be able to generate wakeup signals. + +1.4. Wakeup Signaling +--------------------- + +Wakeup signals generated by PCI devices, either as native PCI PMEs, or as +a result of the execution of the _DSW (or _PSW) ACPI control method before +putting the device into a low-power state, have to be caught and handled as +appropriate. If they are sent while the system is in the working state +(ACPI S0), they should be translated into interrupts so that the kernel can +put the devices generating them into the full-power state and take care of the +events that triggered them. In turn, if they are sent while the system is +sleeping, they should cause the system's core logic to trigger wakeup. + +On ACPI-based systems wakeup signals sent by conventional PCI devices are +converted into ACPI General-Purpose Events (GPEs) which are hardware signals +from the system core logic generated in response to various events that need to +be acted upon. Every GPE is associated with one or more sources of potentially +interesting events. In particular, a GPE may be associated with a PCI device +capable of signaling wakeup. The information on the connections between GPEs +and event sources is recorded in the system's ACPI BIOS from where it can be +read by the kernel. + +If a PCI device known to the system's ACPI BIOS signals wakeup, the GPE +associated with it (if there is one) is triggered. The GPEs associated with PCI +bridges may also be triggered in response to a wakeup signal from one of the +devices below the bridge (this also is the case for root bridges) and, for +example, native PCI PMEs from devices unknown to the system's ACPI BIOS may be +handled this way. + +A GPE may be triggered when the system is sleeping (i.e. when it is in one of +the ACPI S1-S4 states), in which case system wakeup is started by its core logic +(the device that was the source of the signal causing the system wakeup to occur +may be identified later). The GPEs used in such situations are referred to as +wakeup GPEs. + +Usually, however, GPEs are also triggered when the system is in the working +state (ACPI S0) and in that case the system's core logic generates a System +Control Interrupt (SCI) to notify the kernel of the event. Then, the SCI +handler identifies the GPE that caused the interrupt to be generated which, +in turn, allows the kernel to identify the source of the event (that may be +a PCI device signaling wakeup). The GPEs used for notifying the kernel of +events occurring while the system is in the working state are referred to as +runtime GPEs. + +Unfortunately, there is no standard way of handling wakeup signals sent by +conventional PCI devices on systems that are not ACPI-based, but there is one +for PCI Express devices. Namely, the PCI Express Base Specification introduced +a native mechanism for converting native PCI PMEs into interrupts generated by +root ports. For conventional PCI devices native PMEs are out-of-band, so they +are routed separately and they need not pass through bridges (in principle they +may be routed directly to the system's core logic), but for PCI Express devices +they are in-band messages that have to pass through the PCI Express hierarchy, +including the root port on the path from the device to the Root Complex. Thus +it was possible to introduce a mechanism by which a root port generates an +interrupt whenever it receives a PME message from one of the devices below it. +The PCI Express Requester ID of the device that sent the PME message is then +recorded in one of the root port's configuration registers from where it may be +read by the interrupt handler allowing the device to be identified. [PME +messages sent by PCI Express endpoints integrated with the Root Complex don't +pass through root ports, but instead they cause a Root Complex Event Collector +(if there is one) to generate interrupts.] + +In principle the native PCI Express PME signaling may also be used on ACPI-based +systems along with the GPEs, but to use it the kernel has to ask the system's +ACPI BIOS to release control of root port configuration registers. The ACPI +BIOS, however, is not required to allow the kernel to control these registers +and if it doesn't do that, the kernel must not modify their contents. Of course +the native PCI Express PME signaling cannot be used by the kernel in that case. + + +2. PCI Subsystem and Device Power Management +============================================ + +2.1. Device Power Management Callbacks +-------------------------------------- + +The PCI Subsystem participates in the power management of PCI devices in a +number of ways. First of all, it provides an intermediate code layer between +the device power management core (PM core) and PCI device drivers. +Specifically, the pm field of the PCI subsystem's struct bus_type object, +pci_bus_type, points to a struct dev_pm_ops object, pci_dev_pm_ops, containing +pointers to several device power management callbacks:: + + const struct dev_pm_ops pci_dev_pm_ops = { + .prepare = pci_pm_prepare, + .complete = pci_pm_complete, + .suspend = pci_pm_suspend, + .resume = pci_pm_resume, + .freeze = pci_pm_freeze, + .thaw = pci_pm_thaw, + .poweroff = pci_pm_poweroff, + .restore = pci_pm_restore, + .suspend_noirq = pci_pm_suspend_noirq, + .resume_noirq = pci_pm_resume_noirq, + .freeze_noirq = pci_pm_freeze_noirq, + .thaw_noirq = pci_pm_thaw_noirq, + .poweroff_noirq = pci_pm_poweroff_noirq, + .restore_noirq = pci_pm_restore_noirq, + .runtime_suspend = pci_pm_runtime_suspend, + .runtime_resume = pci_pm_runtime_resume, + .runtime_idle = pci_pm_runtime_idle, + }; + +These callbacks are executed by the PM core in various situations related to +device power management and they, in turn, execute power management callbacks +provided by PCI device drivers. They also perform power management operations +involving some standard configuration registers of PCI devices that device +drivers need not know or care about. + +The structure representing a PCI device, struct pci_dev, contains several fields +that these callbacks operate on:: + + struct pci_dev { + ... + pci_power_t current_state; /* Current operating state. */ + int pm_cap; /* PM capability offset in the + configuration space */ + unsigned int pme_support:5; /* Bitmask of states from which PME# + can be generated */ + unsigned int pme_interrupt:1;/* Is native PCIe PME signaling used? */ + unsigned int d1_support:1; /* Low power state D1 is supported */ + unsigned int d2_support:1; /* Low power state D2 is supported */ + unsigned int no_d1d2:1; /* D1 and D2 are forbidden */ + unsigned int wakeup_prepared:1; /* Device prepared for wake up */ + unsigned int d3_delay; /* D3->D0 transition time in ms */ + ... + }; + +They also indirectly use some fields of the struct device that is embedded in +struct pci_dev. + +2.2. Device Initialization +-------------------------- + +The PCI subsystem's first task related to device power management is to +prepare the device for power management and initialize the fields of struct +pci_dev used for this purpose. This happens in two functions defined in +drivers/pci/pci.c, pci_pm_init() and platform_pci_wakeup_init(). + +The first of these functions checks if the device supports native PCI PM +and if that's the case the offset of its power management capability structure +in the configuration space is stored in the pm_cap field of the device's struct +pci_dev object. Next, the function checks which PCI low-power states are +supported by the device and from which low-power states the device can generate +native PCI PMEs. The power management fields of the device's struct pci_dev and +the struct device embedded in it are updated accordingly and the generation of +PMEs by the device is disabled. + +The second function checks if the device can be prepared to signal wakeup with +the help of the platform firmware, such as the ACPI BIOS. If that is the case, +the function updates the wakeup fields in struct device embedded in the +device's struct pci_dev and uses the firmware-provided method to prevent the +device from signaling wakeup. + +At this point the device is ready for power management. For driverless devices, +however, this functionality is limited to a few basic operations carried out +during system-wide transitions to a sleep state and back to the working state. + +2.3. Runtime Device Power Management +------------------------------------ + +The PCI subsystem plays a vital role in the runtime power management of PCI +devices. For this purpose it uses the general runtime power management +(runtime PM) framework described in Documentation/power/runtime_pm.rst. +Namely, it provides subsystem-level callbacks:: + + pci_pm_runtime_suspend() + pci_pm_runtime_resume() + pci_pm_runtime_idle() + +that are executed by the core runtime PM routines. It also implements the +entire mechanics necessary for handling runtime wakeup signals from PCI devices +in low-power states, which at the time of this writing works for both the native +PCI Express PME signaling and the ACPI GPE-based wakeup signaling described in +Section 1. + +First, a PCI device is put into a low-power state, or suspended, with the help +of pm_schedule_suspend() or pm_runtime_suspend() which for PCI devices call +pci_pm_runtime_suspend() to do the actual job. For this to work, the device's +driver has to provide a pm->runtime_suspend() callback (see below), which is +run by pci_pm_runtime_suspend() as the first action. If the driver's callback +returns successfully, the device's standard configuration registers are saved, +the device is prepared to generate wakeup signals and, finally, it is put into +the target low-power state. + +The low-power state to put the device into is the lowest-power (highest number) +state from which it can signal wakeup. The exact method of signaling wakeup is +system-dependent and is determined by the PCI subsystem on the basis of the +reported capabilities of the device and the platform firmware. To prepare the +device for signaling wakeup and put it into the selected low-power state, the +PCI subsystem can use the platform firmware as well as the device's native PCI +PM capabilities, if supported. + +It is expected that the device driver's pm->runtime_suspend() callback will +not attempt to prepare the device for signaling wakeup or to put it into a +low-power state. The driver ought to leave these tasks to the PCI subsystem +that has all of the information necessary to perform them. + +A suspended device is brought back into the "active" state, or resumed, +with the help of pm_request_resume() or pm_runtime_resume() which both call +pci_pm_runtime_resume() for PCI devices. Again, this only works if the device's +driver provides a pm->runtime_resume() callback (see below). However, before +the driver's callback is executed, pci_pm_runtime_resume() brings the device +back into the full-power state, prevents it from signaling wakeup while in that +state and restores its standard configuration registers. Thus the driver's +callback need not worry about the PCI-specific aspects of the device resume. + +Note that generally pci_pm_runtime_resume() may be called in two different +situations. First, it may be called at the request of the device's driver, for +example if there are some data for it to process. Second, it may be called +as a result of a wakeup signal from the device itself (this sometimes is +referred to as "remote wakeup"). Of course, for this purpose the wakeup signal +is handled in one of the ways described in Section 1 and finally converted into +a notification for the PCI subsystem after the source device has been +identified. + +The pci_pm_runtime_idle() function, called for PCI devices by pm_runtime_idle() +and pm_request_idle(), executes the device driver's pm->runtime_idle() +callback, if defined, and if that callback doesn't return error code (or is not +present at all), suspends the device with the help of pm_runtime_suspend(). +Sometimes pci_pm_runtime_idle() is called automatically by the PM core (for +example, it is called right after the device has just been resumed), in which +cases it is expected to suspend the device if that makes sense. Usually, +however, the PCI subsystem doesn't really know if the device really can be +suspended, so it lets the device's driver decide by running its +pm->runtime_idle() callback. + +2.4. System-Wide Power Transitions +---------------------------------- +There are a few different types of system-wide power transitions, described in +Documentation/driver-api/pm/devices.rst. Each of them requires devices to be handled +in a specific way and the PM core executes subsystem-level power management +callbacks for this purpose. They are executed in phases such that each phase +involves executing the same subsystem-level callback for every device belonging +to the given subsystem before the next phase begins. These phases always run +after tasks have been frozen. + +2.4.1. System Suspend +^^^^^^^^^^^^^^^^^^^^^ + +When the system is going into a sleep state in which the contents of memory will +be preserved, such as one of the ACPI sleep states S1-S3, the phases are: + + prepare, suspend, suspend_noirq. + +The following PCI bus type's callbacks, respectively, are used in these phases:: + + pci_pm_prepare() + pci_pm_suspend() + pci_pm_suspend_noirq() + +The pci_pm_prepare() routine first puts the device into the "fully functional" +state with the help of pm_runtime_resume(). Then, it executes the device +driver's pm->prepare() callback if defined (i.e. if the driver's struct +dev_pm_ops object is present and the prepare pointer in that object is valid). + +The pci_pm_suspend() routine first checks if the device's driver implements +legacy PCI suspend routines (see Section 3), in which case the driver's legacy +suspend callback is executed, if present, and its result is returned. Next, if +the device's driver doesn't provide a struct dev_pm_ops object (containing +pointers to the driver's callbacks), pci_pm_default_suspend() is called, which +simply turns off the device's bus master capability and runs +pcibios_disable_device() to disable it, unless the device is a bridge (PCI +bridges are ignored by this routine). Next, the device driver's pm->suspend() +callback is executed, if defined, and its result is returned if it fails. +Finally, pci_fixup_device() is called to apply hardware suspend quirks related +to the device if necessary. + +Note that the suspend phase is carried out asynchronously for PCI devices, so +the pci_pm_suspend() callback may be executed in parallel for any pair of PCI +devices that don't depend on each other in a known way (i.e. none of the paths +in the device tree from the root bridge to a leaf device contains both of them). + +The pci_pm_suspend_noirq() routine is executed after suspend_device_irqs() has +been called, which means that the device driver's interrupt handler won't be +invoked while this routine is running. It first checks if the device's driver +implements legacy PCI suspends routines (Section 3), in which case the legacy +late suspend routine is called and its result is returned (the standard +configuration registers of the device are saved if the driver's callback hasn't +done that). Second, if the device driver's struct dev_pm_ops object is not +present, the device's standard configuration registers are saved and the routine +returns success. Otherwise the device driver's pm->suspend_noirq() callback is +executed, if present, and its result is returned if it fails. Next, if the +device's standard configuration registers haven't been saved yet (one of the +device driver's callbacks executed before might do that), pci_pm_suspend_noirq() +saves them, prepares the device to signal wakeup (if necessary) and puts it into +a low-power state. + +The low-power state to put the device into is the lowest-power (highest number) +state from which it can signal wakeup while the system is in the target sleep +state. Just like in the runtime PM case described above, the mechanism of +signaling wakeup is system-dependent and determined by the PCI subsystem, which +is also responsible for preparing the device to signal wakeup from the system's +target sleep state as appropriate. + +PCI device drivers (that don't implement legacy power management callbacks) are +generally not expected to prepare devices for signaling wakeup or to put them +into low-power states. However, if one of the driver's suspend callbacks +(pm->suspend() or pm->suspend_noirq()) saves the device's standard configuration +registers, pci_pm_suspend_noirq() will assume that the device has been prepared +to signal wakeup and put into a low-power state by the driver (the driver is +then assumed to have used the helper functions provided by the PCI subsystem for +this purpose). PCI device drivers are not encouraged to do that, but in some +rare cases doing that in the driver may be the optimum approach. + +2.4.2. System Resume +^^^^^^^^^^^^^^^^^^^^ + +When the system is undergoing a transition from a sleep state in which the +contents of memory have been preserved, such as one of the ACPI sleep states +S1-S3, into the working state (ACPI S0), the phases are: + + resume_noirq, resume, complete. + +The following PCI bus type's callbacks, respectively, are executed in these +phases:: + + pci_pm_resume_noirq() + pci_pm_resume() + pci_pm_complete() + +The pci_pm_resume_noirq() routine first puts the device into the full-power +state, restores its standard configuration registers and applies early resume +hardware quirks related to the device, if necessary. This is done +unconditionally, regardless of whether or not the device's driver implements +legacy PCI power management callbacks (this way all PCI devices are in the +full-power state and their standard configuration registers have been restored +when their interrupt handlers are invoked for the first time during resume, +which allows the kernel to avoid problems with the handling of shared interrupts +by drivers whose devices are still suspended). If legacy PCI power management +callbacks (see Section 3) are implemented by the device's driver, the legacy +early resume callback is executed and its result is returned. Otherwise, the +device driver's pm->resume_noirq() callback is executed, if defined, and its +result is returned. + +The pci_pm_resume() routine first checks if the device's standard configuration +registers have been restored and restores them if that's not the case (this +only is necessary in the error path during a failing suspend). Next, resume +hardware quirks related to the device are applied, if necessary, and if the +device's driver implements legacy PCI power management callbacks (see +Section 3), the driver's legacy resume callback is executed and its result is +returned. Otherwise, the device's wakeup signaling mechanisms are blocked and +its driver's pm->resume() callback is executed, if defined (the callback's +result is then returned). + +The resume phase is carried out asynchronously for PCI devices, like the +suspend phase described above, which means that if two PCI devices don't depend +on each other in a known way, the pci_pm_resume() routine may be executed for +the both of them in parallel. + +The pci_pm_complete() routine only executes the device driver's pm->complete() +callback, if defined. + +2.4.3. System Hibernation +^^^^^^^^^^^^^^^^^^^^^^^^^ + +System hibernation is more complicated than system suspend, because it requires +a system image to be created and written into a persistent storage medium. The +image is created atomically and all devices are quiesced, or frozen, before that +happens. + +The freezing of devices is carried out after enough memory has been freed (at +the time of this writing the image creation requires at least 50% of system RAM +to be free) in the following three phases: + + prepare, freeze, freeze_noirq + +that correspond to the PCI bus type's callbacks:: + + pci_pm_prepare() + pci_pm_freeze() + pci_pm_freeze_noirq() + +This means that the prepare phase is exactly the same as for system suspend. +The other two phases, however, are different. + +The pci_pm_freeze() routine is quite similar to pci_pm_suspend(), but it runs +the device driver's pm->freeze() callback, if defined, instead of pm->suspend(), +and it doesn't apply the suspend-related hardware quirks. It is executed +asynchronously for different PCI devices that don't depend on each other in a +known way. + +The pci_pm_freeze_noirq() routine, in turn, is similar to +pci_pm_suspend_noirq(), but it calls the device driver's pm->freeze_noirq() +routine instead of pm->suspend_noirq(). It also doesn't attempt to prepare the +device for signaling wakeup and put it into a low-power state. Still, it saves +the device's standard configuration registers if they haven't been saved by one +of the driver's callbacks. + +Once the image has been created, it has to be saved. However, at this point all +devices are frozen and they cannot handle I/O, while their ability to handle +I/O is obviously necessary for the image saving. Thus they have to be brought +back to the fully functional state and this is done in the following phases: + + thaw_noirq, thaw, complete + +using the following PCI bus type's callbacks:: + + pci_pm_thaw_noirq() + pci_pm_thaw() + pci_pm_complete() + +respectively. + +The first of them, pci_pm_thaw_noirq(), is analogous to pci_pm_resume_noirq(), +but it doesn't put the device into the full power state and doesn't attempt to +restore its standard configuration registers. It also executes the device +driver's pm->thaw_noirq() callback, if defined, instead of pm->resume_noirq(). + +The pci_pm_thaw() routine is similar to pci_pm_resume(), but it runs the device +driver's pm->thaw() callback instead of pm->resume(). It is executed +asynchronously for different PCI devices that don't depend on each other in a +known way. + +The complete phase it the same as for system resume. + +After saving the image, devices need to be powered down before the system can +enter the target sleep state (ACPI S4 for ACPI-based systems). This is done in +three phases: + + prepare, poweroff, poweroff_noirq + +where the prepare phase is exactly the same as for system suspend. The other +two phases are analogous to the suspend and suspend_noirq phases, respectively. +The PCI subsystem-level callbacks they correspond to:: + + pci_pm_poweroff() + pci_pm_poweroff_noirq() + +work in analogy with pci_pm_suspend() and pci_pm_poweroff_noirq(), respectively, +although they don't attempt to save the device's standard configuration +registers. + +2.4.4. System Restore +^^^^^^^^^^^^^^^^^^^^^ + +System restore requires a hibernation image to be loaded into memory and the +pre-hibernation memory contents to be restored before the pre-hibernation system +activity can be resumed. + +As described in Documentation/driver-api/pm/devices.rst, the hibernation image is loaded +into memory by a fresh instance of the kernel, called the boot kernel, which in +turn is loaded and run by a boot loader in the usual way. After the boot kernel +has loaded the image, it needs to replace its own code and data with the code +and data of the "hibernated" kernel stored within the image, called the image +kernel. For this purpose all devices are frozen just like before creating +the image during hibernation, in the + + prepare, freeze, freeze_noirq + +phases described above. However, the devices affected by these phases are only +those having drivers in the boot kernel; other devices will still be in whatever +state the boot loader left them. + +Should the restoration of the pre-hibernation memory contents fail, the boot +kernel would go through the "thawing" procedure described above, using the +thaw_noirq, thaw, and complete phases (that will only affect the devices having +drivers in the boot kernel), and then continue running normally. + +If the pre-hibernation memory contents are restored successfully, which is the +usual situation, control is passed to the image kernel, which then becomes +responsible for bringing the system back to the working state. To achieve this, +it must restore the devices' pre-hibernation functionality, which is done much +like waking up from the memory sleep state, although it involves different +phases: + + restore_noirq, restore, complete + +The first two of these are analogous to the resume_noirq and resume phases +described above, respectively, and correspond to the following PCI subsystem +callbacks:: + + pci_pm_restore_noirq() + pci_pm_restore() + +These callbacks work in analogy with pci_pm_resume_noirq() and pci_pm_resume(), +respectively, but they execute the device driver's pm->restore_noirq() and +pm->restore() callbacks, if available. + +The complete phase is carried out in exactly the same way as during system +resume. + + +3. PCI Device Drivers and Power Management +========================================== + +3.1. Power Management Callbacks +------------------------------- + +PCI device drivers participate in power management by providing callbacks to be +executed by the PCI subsystem's power management routines described above and by +controlling the runtime power management of their devices. + +At the time of this writing there are two ways to define power management +callbacks for a PCI device driver, the recommended one, based on using a +dev_pm_ops structure described in Documentation/driver-api/pm/devices.rst, and the +"legacy" one, in which the .suspend(), .suspend_late(), .resume_early(), and +.resume() callbacks from struct pci_driver are used. The legacy approach, +however, doesn't allow one to define runtime power management callbacks and is +not really suitable for any new drivers. Therefore it is not covered by this +document (refer to the source code to learn more about it). + +It is recommended that all PCI device drivers define a struct dev_pm_ops object +containing pointers to power management (PM) callbacks that will be executed by +the PCI subsystem's PM routines in various circumstances. A pointer to the +driver's struct dev_pm_ops object has to be assigned to the driver.pm field in +its struct pci_driver object. Once that has happened, the "legacy" PM callbacks +in struct pci_driver are ignored (even if they are not NULL). + +The PM callbacks in struct dev_pm_ops are not mandatory and if they are not +defined (i.e. the respective fields of struct dev_pm_ops are unset) the PCI +subsystem will handle the device in a simplified default manner. If they are +defined, though, they are expected to behave as described in the following +subsections. + +3.1.1. prepare() +^^^^^^^^^^^^^^^^ + +The prepare() callback is executed during system suspend, during hibernation +(when a hibernation image is about to be created), during power-off after +saving a hibernation image and during system restore, when a hibernation image +has just been loaded into memory. + +This callback is only necessary if the driver's device has children that in +general may be registered at any time. In that case the role of the prepare() +callback is to prevent new children of the device from being registered until +one of the resume_noirq(), thaw_noirq(), or restore_noirq() callbacks is run. + +In addition to that the prepare() callback may carry out some operations +preparing the device to be suspended, although it should not allocate memory +(if additional memory is required to suspend the device, it has to be +preallocated earlier, for example in a suspend/hibernate notifier as described +in Documentation/driver-api/pm/notifiers.rst). + +3.1.2. suspend() +^^^^^^^^^^^^^^^^ + +The suspend() callback is only executed during system suspend, after prepare() +callbacks have been executed for all devices in the system. + +This callback is expected to quiesce the device and prepare it to be put into a +low-power state by the PCI subsystem. It is not required (in fact it even is +not recommended) that a PCI driver's suspend() callback save the standard +configuration registers of the device, prepare it for waking up the system, or +put it into a low-power state. All of these operations can very well be taken +care of by the PCI subsystem, without the driver's participation. + +However, in some rare case it is convenient to carry out these operations in +a PCI driver. Then, pci_save_state(), pci_prepare_to_sleep(), and +pci_set_power_state() should be used to save the device's standard configuration +registers, to prepare it for system wakeup (if necessary), and to put it into a +low-power state, respectively. Moreover, if the driver calls pci_save_state(), +the PCI subsystem will not execute either pci_prepare_to_sleep(), or +pci_set_power_state() for its device, so the driver is then responsible for +handling the device as appropriate. + +While the suspend() callback is being executed, the driver's interrupt handler +can be invoked to handle an interrupt from the device, so all suspend-related +operations relying on the driver's ability to handle interrupts should be +carried out in this callback. + +3.1.3. suspend_noirq() +^^^^^^^^^^^^^^^^^^^^^^ + +The suspend_noirq() callback is only executed during system suspend, after +suspend() callbacks have been executed for all devices in the system and +after device interrupts have been disabled by the PM core. + +The difference between suspend_noirq() and suspend() is that the driver's +interrupt handler will not be invoked while suspend_noirq() is running. Thus +suspend_noirq() can carry out operations that would cause race conditions to +arise if they were performed in suspend(). + +3.1.4. freeze() +^^^^^^^^^^^^^^^ + +The freeze() callback is hibernation-specific and is executed in two situations, +during hibernation, after prepare() callbacks have been executed for all devices +in preparation for the creation of a system image, and during restore, +after a system image has been loaded into memory from persistent storage and the +prepare() callbacks have been executed for all devices. + +The role of this callback is analogous to the role of the suspend() callback +described above. In fact, they only need to be different in the rare cases when +the driver takes the responsibility for putting the device into a low-power +state. + +In that cases the freeze() callback should not prepare the device system wakeup +or put it into a low-power state. Still, either it or freeze_noirq() should +save the device's standard configuration registers using pci_save_state(). + +3.1.5. freeze_noirq() +^^^^^^^^^^^^^^^^^^^^^ + +The freeze_noirq() callback is hibernation-specific. It is executed during +hibernation, after prepare() and freeze() callbacks have been executed for all +devices in preparation for the creation of a system image, and during restore, +after a system image has been loaded into memory and after prepare() and +freeze() callbacks have been executed for all devices. It is always executed +after device interrupts have been disabled by the PM core. + +The role of this callback is analogous to the role of the suspend_noirq() +callback described above and it very rarely is necessary to define +freeze_noirq(). + +The difference between freeze_noirq() and freeze() is analogous to the +difference between suspend_noirq() and suspend(). + +3.1.6. poweroff() +^^^^^^^^^^^^^^^^^ + +The poweroff() callback is hibernation-specific. It is executed when the system +is about to be powered off after saving a hibernation image to a persistent +storage. prepare() callbacks are executed for all devices before poweroff() is +called. + +The role of this callback is analogous to the role of the suspend() and freeze() +callbacks described above, although it does not need to save the contents of +the device's registers. In particular, if the driver wants to put the device +into a low-power state itself instead of allowing the PCI subsystem to do that, +the poweroff() callback should use pci_prepare_to_sleep() and +pci_set_power_state() to prepare the device for system wakeup and to put it +into a low-power state, respectively, but it need not save the device's standard +configuration registers. + +3.1.7. poweroff_noirq() +^^^^^^^^^^^^^^^^^^^^^^^ + +The poweroff_noirq() callback is hibernation-specific. It is executed after +poweroff() callbacks have been executed for all devices in the system. + +The role of this callback is analogous to the role of the suspend_noirq() and +freeze_noirq() callbacks described above, but it does not need to save the +contents of the device's registers. + +The difference between poweroff_noirq() and poweroff() is analogous to the +difference between suspend_noirq() and suspend(). + +3.1.8. resume_noirq() +^^^^^^^^^^^^^^^^^^^^^ + +The resume_noirq() callback is only executed during system resume, after the +PM core has enabled the non-boot CPUs. The driver's interrupt handler will not +be invoked while resume_noirq() is running, so this callback can carry out +operations that might race with the interrupt handler. + +Since the PCI subsystem unconditionally puts all devices into the full power +state in the resume_noirq phase of system resume and restores their standard +configuration registers, resume_noirq() is usually not necessary. In general +it should only be used for performing operations that would lead to race +conditions if carried out by resume(). + +3.1.9. resume() +^^^^^^^^^^^^^^^ + +The resume() callback is only executed during system resume, after +resume_noirq() callbacks have been executed for all devices in the system and +device interrupts have been enabled by the PM core. + +This callback is responsible for restoring the pre-suspend configuration of the +device and bringing it back to the fully functional state. The device should be +able to process I/O in a usual way after resume() has returned. + +3.1.10. thaw_noirq() +^^^^^^^^^^^^^^^^^^^^ + +The thaw_noirq() callback is hibernation-specific. It is executed after a +system image has been created and the non-boot CPUs have been enabled by the PM +core, in the thaw_noirq phase of hibernation. It also may be executed if the +loading of a hibernation image fails during system restore (it is then executed +after enabling the non-boot CPUs). The driver's interrupt handler will not be +invoked while thaw_noirq() is running. + +The role of this callback is analogous to the role of resume_noirq(). The +difference between these two callbacks is that thaw_noirq() is executed after +freeze() and freeze_noirq(), so in general it does not need to modify the +contents of the device's registers. + +3.1.11. thaw() +^^^^^^^^^^^^^^ + +The thaw() callback is hibernation-specific. It is executed after thaw_noirq() +callbacks have been executed for all devices in the system and after device +interrupts have been enabled by the PM core. + +This callback is responsible for restoring the pre-freeze configuration of +the device, so that it will work in a usual way after thaw() has returned. + +3.1.12. restore_noirq() +^^^^^^^^^^^^^^^^^^^^^^^ + +The restore_noirq() callback is hibernation-specific. It is executed in the +restore_noirq phase of hibernation, when the boot kernel has passed control to +the image kernel and the non-boot CPUs have been enabled by the image kernel's +PM core. + +This callback is analogous to resume_noirq() with the exception that it cannot +make any assumption on the previous state of the device, even if the BIOS (or +generally the platform firmware) is known to preserve that state over a +suspend-resume cycle. + +For the vast majority of PCI device drivers there is no difference between +resume_noirq() and restore_noirq(). + +3.1.13. restore() +^^^^^^^^^^^^^^^^^ + +The restore() callback is hibernation-specific. It is executed after +restore_noirq() callbacks have been executed for all devices in the system and +after the PM core has enabled device drivers' interrupt handlers to be invoked. + +This callback is analogous to resume(), just like restore_noirq() is analogous +to resume_noirq(). Consequently, the difference between restore_noirq() and +restore() is analogous to the difference between resume_noirq() and resume(). + +For the vast majority of PCI device drivers there is no difference between +resume() and restore(). + +3.1.14. complete() +^^^^^^^^^^^^^^^^^^ + +The complete() callback is executed in the following situations: + + - during system resume, after resume() callbacks have been executed for all + devices, + - during hibernation, before saving the system image, after thaw() callbacks + have been executed for all devices, + - during system restore, when the system is going back to its pre-hibernation + state, after restore() callbacks have been executed for all devices. + +It also may be executed if the loading of a hibernation image into memory fails +(in that case it is run after thaw() callbacks have been executed for all +devices that have drivers in the boot kernel). + +This callback is entirely optional, although it may be necessary if the +prepare() callback performs operations that need to be reversed. + +3.1.15. runtime_suspend() +^^^^^^^^^^^^^^^^^^^^^^^^^ + +The runtime_suspend() callback is specific to device runtime power management +(runtime PM). It is executed by the PM core's runtime PM framework when the +device is about to be suspended (i.e. quiesced and put into a low-power state) +at run time. + +This callback is responsible for freezing the device and preparing it to be +put into a low-power state, but it must allow the PCI subsystem to perform all +of the PCI-specific actions necessary for suspending the device. + +3.1.16. runtime_resume() +^^^^^^^^^^^^^^^^^^^^^^^^ + +The runtime_resume() callback is specific to device runtime PM. It is executed +by the PM core's runtime PM framework when the device is about to be resumed +(i.e. put into the full-power state and programmed to process I/O normally) at +run time. + +This callback is responsible for restoring the normal functionality of the +device after it has been put into the full-power state by the PCI subsystem. +The device is expected to be able to process I/O in the usual way after +runtime_resume() has returned. + +3.1.17. runtime_idle() +^^^^^^^^^^^^^^^^^^^^^^ + +The runtime_idle() callback is specific to device runtime PM. It is executed +by the PM core's runtime PM framework whenever it may be desirable to suspend +the device according to the PM core's information. In particular, it is +automatically executed right after runtime_resume() has returned in case the +resume of the device has happened as a result of a spurious event. + +This callback is optional, but if it is not implemented or if it returns 0, the +PCI subsystem will call pm_runtime_suspend() for the device, which in turn will +cause the driver's runtime_suspend() callback to be executed. + +3.1.18. Pointing Multiple Callback Pointers to One Routine +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Although in principle each of the callbacks described in the previous +subsections can be defined as a separate function, it often is convenient to +point two or more members of struct dev_pm_ops to the same routine. There are +a few convenience macros that can be used for this purpose. + +The SIMPLE_DEV_PM_OPS macro declares a struct dev_pm_ops object with one +suspend routine pointed to by the .suspend(), .freeze(), and .poweroff() +members and one resume routine pointed to by the .resume(), .thaw(), and +.restore() members. The other function pointers in this struct dev_pm_ops are +unset. + +The UNIVERSAL_DEV_PM_OPS macro is similar to SIMPLE_DEV_PM_OPS, but it +additionally sets the .runtime_resume() pointer to the same value as +.resume() (and .thaw(), and .restore()) and the .runtime_suspend() pointer to +the same value as .suspend() (and .freeze() and .poweroff()). + +The SET_SYSTEM_SLEEP_PM_OPS can be used inside of a declaration of struct +dev_pm_ops to indicate that one suspend routine is to be pointed to by the +.suspend(), .freeze(), and .poweroff() members and one resume routine is to +be pointed to by the .resume(), .thaw(), and .restore() members. + +3.1.19. Driver Flags for Power Management +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The PM core allows device drivers to set flags that influence the handling of +power management for the devices by the core itself and by middle layer code +including the PCI bus type. The flags should be set once at the driver probe +time with the help of the dev_pm_set_driver_flags() function and they should not +be updated directly afterwards. + +The DPM_FLAG_NEVER_SKIP flag prevents the PM core from using the direct-complete +mechanism allowing device suspend/resume callbacks to be skipped if the device +is in runtime suspend when the system suspend starts. That also affects all of +the ancestors of the device, so this flag should only be used if absolutely +necessary. + +The DPM_FLAG_SMART_PREPARE flag instructs the PCI bus type to only return a +positive value from pci_pm_prepare() if the ->prepare callback provided by the +driver of the device returns a positive value. That allows the driver to opt +out from using the direct-complete mechanism dynamically. + +The DPM_FLAG_SMART_SUSPEND flag tells the PCI bus type that from the driver's +perspective the device can be safely left in runtime suspend during system +suspend. That causes pci_pm_suspend(), pci_pm_freeze() and pci_pm_poweroff() +to skip resuming the device from runtime suspend unless there are PCI-specific +reasons for doing that. Also, it causes pci_pm_suspend_late/noirq(), +pci_pm_freeze_late/noirq() and pci_pm_poweroff_late/noirq() to return early +if the device remains in runtime suspend in the beginning of the "late" phase +of the system-wide transition under way. Moreover, if the device is in +runtime suspend in pci_pm_resume_noirq() or pci_pm_restore_noirq(), its runtime +power management status will be changed to "active" (as it is going to be put +into D0 going forward), but if it is in runtime suspend in pci_pm_thaw_noirq(), +the function will set the power.direct_complete flag for it (to make the PM core +skip the subsequent "thaw" callbacks for it) and return. + +Setting the DPM_FLAG_LEAVE_SUSPENDED flag means that the driver prefers the +device to be left in suspend after system-wide transitions to the working state. +This flag is checked by the PM core, but the PCI bus type informs the PM core +which devices may be left in suspend from its perspective (that happens during +the "noirq" phase of system-wide suspend and analogous transitions) and next it +uses the dev_pm_may_skip_resume() helper to decide whether or not to return from +pci_pm_resume_noirq() early, as the PM core will skip the remaining resume +callbacks for the device during the transition under way and will set its +runtime PM status to "suspended" if dev_pm_may_skip_resume() returns "true" for +it. + +3.2. Device Runtime Power Management +------------------------------------ + +In addition to providing device power management callbacks PCI device drivers +are responsible for controlling the runtime power management (runtime PM) of +their devices. + +The PCI device runtime PM is optional, but it is recommended that PCI device +drivers implement it at least in the cases where there is a reliable way of +verifying that the device is not used (like when the network cable is detached +from an Ethernet adapter or there are no devices attached to a USB controller). + +To support the PCI runtime PM the driver first needs to implement the +runtime_suspend() and runtime_resume() callbacks. It also may need to implement +the runtime_idle() callback to prevent the device from being suspended again +every time right after the runtime_resume() callback has returned +(alternatively, the runtime_suspend() callback will have to check if the +device should really be suspended and return -EAGAIN if that is not the case). + +The runtime PM of PCI devices is enabled by default by the PCI core. PCI +device drivers do not need to enable it and should not attempt to do so. +However, it is blocked by pci_pm_init() that runs the pm_runtime_forbid() +helper function. In addition to that, the runtime PM usage counter of +each PCI device is incremented by local_pci_probe() before executing the +probe callback provided by the device's driver. + +If a PCI driver implements the runtime PM callbacks and intends to use the +runtime PM framework provided by the PM core and the PCI subsystem, it needs +to decrement the device's runtime PM usage counter in its probe callback +function. If it doesn't do that, the counter will always be different from +zero for the device and it will never be runtime-suspended. The simplest +way to do that is by calling pm_runtime_put_noidle(), but if the driver +wants to schedule an autosuspend right away, for example, it may call +pm_runtime_put_autosuspend() instead for this purpose. Generally, it +just needs to call a function that decrements the devices usage counter +from its probe routine to make runtime PM work for the device. + +It is important to remember that the driver's runtime_suspend() callback +may be executed right after the usage counter has been decremented, because +user space may already have caused the pm_runtime_allow() helper function +unblocking the runtime PM of the device to run via sysfs, so the driver must +be prepared to cope with that. + +The driver itself should not call pm_runtime_allow(), though. Instead, it +should let user space or some platform-specific code do that (user space can +do it via sysfs as stated above), but it must be prepared to handle the +runtime PM of the device correctly as soon as pm_runtime_allow() is called +(which may happen at any time, even before the driver is loaded). + +When the driver's remove callback runs, it has to balance the decrementation +of the device's runtime PM usage counter at the probe time. For this reason, +if it has decremented the counter in its probe callback, it must run +pm_runtime_get_noresume() in its remove callback. [Since the core carries +out a runtime resume of the device and bumps up the device's usage counter +before running the driver's remove callback, the runtime PM of the device +is effectively disabled for the duration of the remove execution and all +runtime PM helper functions incrementing the device's usage counter are +then effectively equivalent to pm_runtime_get_noresume().] + +The runtime PM framework works by processing requests to suspend or resume +devices, or to check if they are idle (in which cases it is reasonable to +subsequently request that they be suspended). These requests are represented +by work items put into the power management workqueue, pm_wq. Although there +are a few situations in which power management requests are automatically +queued by the PM core (for example, after processing a request to resume a +device the PM core automatically queues a request to check if the device is +idle), device drivers are generally responsible for queuing power management +requests for their devices. For this purpose they should use the runtime PM +helper functions provided by the PM core, discussed in +Documentation/power/runtime_pm.rst. + +Devices can also be suspended and resumed synchronously, without placing a +request into pm_wq. In the majority of cases this also is done by their +drivers that use helper functions provided by the PM core for this purpose. + +For more information on the runtime PM of devices refer to +Documentation/power/runtime_pm.rst. + + +4. Resources +============ + +PCI Local Bus Specification, Rev. 3.0 + +PCI Bus Power Management Interface Specification, Rev. 1.2 + +Advanced Configuration and Power Interface (ACPI) Specification, Rev. 3.0b + +PCI Express Base Specification, Rev. 2.0 + +Documentation/driver-api/pm/devices.rst + +Documentation/power/runtime_pm.rst diff --git a/Documentation/power/pci.txt b/Documentation/power/pci.txt deleted file mode 100644 index 8eaf9ee24d43..000000000000 --- a/Documentation/power/pci.txt +++ /dev/null @@ -1,1094 +0,0 @@ -PCI Power Management - -Copyright (c) 2010 Rafael J. Wysocki , Novell Inc. - -An overview of concepts and the Linux kernel's interfaces related to PCI power -management. Based on previous work by Patrick Mochel -(and others). - -This document only covers the aspects of power management specific to PCI -devices. For general description of the kernel's interfaces related to device -power management refer to Documentation/driver-api/pm/devices.rst and -Documentation/power/runtime_pm.txt. - ---------------------------------------------------------------------------- - -1. Hardware and Platform Support for PCI Power Management -2. PCI Subsystem and Device Power Management -3. PCI Device Drivers and Power Management -4. Resources - - -1. Hardware and Platform Support for PCI Power Management -========================================================= - -1.1. Native and Platform-Based Power Management ------------------------------------------------ -In general, power management is a feature allowing one to save energy by putting -devices into states in which they draw less power (low-power states) at the -price of reduced functionality or performance. - -Usually, a device is put into a low-power state when it is underutilized or -completely inactive. However, when it is necessary to use the device once -again, it has to be put back into the "fully functional" state (full-power -state). This may happen when there are some data for the device to handle or -as a result of an external event requiring the device to be active, which may -be signaled by the device itself. - -PCI devices may be put into low-power states in two ways, by using the device -capabilities introduced by the PCI Bus Power Management Interface Specification, -or with the help of platform firmware, such as an ACPI BIOS. In the first -approach, that is referred to as the native PCI power management (native PCI PM) -in what follows, the device power state is changed as a result of writing a -specific value into one of its standard configuration registers. The second -approach requires the platform firmware to provide special methods that may be -used by the kernel to change the device's power state. - -Devices supporting the native PCI PM usually can generate wakeup signals called -Power Management Events (PMEs) to let the kernel know about external events -requiring the device to be active. After receiving a PME the kernel is supposed -to put the device that sent it into the full-power state. However, the PCI Bus -Power Management Interface Specification doesn't define any standard method of -delivering the PME from the device to the CPU and the operating system kernel. -It is assumed that the platform firmware will perform this task and therefore, -even though a PCI device is set up to generate PMEs, it also may be necessary to -prepare the platform firmware for notifying the CPU of the PMEs coming from the -device (e.g. by generating interrupts). - -In turn, if the methods provided by the platform firmware are used for changing -the power state of a device, usually the platform also provides a method for -preparing the device to generate wakeup signals. In that case, however, it -often also is necessary to prepare the device for generating PMEs using the -native PCI PM mechanism, because the method provided by the platform depends on -that. - -Thus in many situations both the native and the platform-based power management -mechanisms have to be used simultaneously to obtain the desired result. - -1.2. Native PCI Power Management --------------------------------- -The PCI Bus Power Management Interface Specification (PCI PM Spec) was -introduced between the PCI 2.1 and PCI 2.2 Specifications. It defined a -standard interface for performing various operations related to power -management. - -The implementation of the PCI PM Spec is optional for conventional PCI devices, -but it is mandatory for PCI Express devices. If a device supports the PCI PM -Spec, it has an 8 byte power management capability field in its PCI -configuration space. This field is used to describe and control the standard -features related to the native PCI power management. - -The PCI PM Spec defines 4 operating states for devices (D0-D3) and for buses -(B0-B3). The higher the number, the less power is drawn by the device or bus -in that state. However, the higher the number, the longer the latency for -the device or bus to return to the full-power state (D0 or B0, respectively). - -There are two variants of the D3 state defined by the specification. The first -one is D3hot, referred to as the software accessible D3, because devices can be -programmed to go into it. The second one, D3cold, is the state that PCI devices -are in when the supply voltage (Vcc) is removed from them. It is not possible -to program a PCI device to go into D3cold, although there may be a programmable -interface for putting the bus the device is on into a state in which Vcc is -removed from all devices on the bus. - -PCI bus power management, however, is not supported by the Linux kernel at the -time of this writing and therefore it is not covered by this document. - -Note that every PCI device can be in the full-power state (D0) or in D3cold, -regardless of whether or not it implements the PCI PM Spec. In addition to -that, if the PCI PM Spec is implemented by the device, it must support D3hot -as well as D0. The support for the D1 and D2 power states is optional. - -PCI devices supporting the PCI PM Spec can be programmed to go to any of the -supported low-power states (except for D3cold). While in D1-D3hot the -standard configuration registers of the device must be accessible to software -(i.e. the device is required to respond to PCI configuration accesses), although -its I/O and memory spaces are then disabled. This allows the device to be -programmatically put into D0. Thus the kernel can switch the device back and -forth between D0 and the supported low-power states (except for D3cold) and the -possible power state transitions the device can undergo are the following: - -+----------------------------+ -| Current State | New State | -+----------------------------+ -| D0 | D1, D2, D3 | -+----------------------------+ -| D1 | D2, D3 | -+----------------------------+ -| D2 | D3 | -+----------------------------+ -| D1, D2, D3 | D0 | -+----------------------------+ - -The transition from D3cold to D0 occurs when the supply voltage is provided to -the device (i.e. power is restored). In that case the device returns to D0 with -a full power-on reset sequence and the power-on defaults are restored to the -device by hardware just as at initial power up. - -PCI devices supporting the PCI PM Spec can be programmed to generate PMEs -while in a low-power state (D1-D3), but they are not required to be capable -of generating PMEs from all supported low-power states. In particular, the -capability of generating PMEs from D3cold is optional and depends on the -presence of additional voltage (3.3Vaux) allowing the device to remain -sufficiently active to generate a wakeup signal. - -1.3. ACPI Device Power Management ---------------------------------- -The platform firmware support for the power management of PCI devices is -system-specific. However, if the system in question is compliant with the -Advanced Configuration and Power Interface (ACPI) Specification, like the -majority of x86-based systems, it is supposed to implement device power -management interfaces defined by the ACPI standard. - -For this purpose the ACPI BIOS provides special functions called "control -methods" that may be executed by the kernel to perform specific tasks, such as -putting a device into a low-power state. These control methods are encoded -using special byte-code language called the ACPI Machine Language (AML) and -stored in the machine's BIOS. The kernel loads them from the BIOS and executes -them as needed using an AML interpreter that translates the AML byte code into -computations and memory or I/O space accesses. This way, in theory, a BIOS -writer can provide the kernel with a means to perform actions depending -on the system design in a system-specific fashion. - -ACPI control methods may be divided into global control methods, that are not -associated with any particular devices, and device control methods, that have -to be defined separately for each device supposed to be handled with the help of -the platform. This means, in particular, that ACPI device control methods can -only be used to handle devices that the BIOS writer knew about in advance. The -ACPI methods used for device power management fall into that category. - -The ACPI specification assumes that devices can be in one of four power states -labeled as D0, D1, D2, and D3 that roughly correspond to the native PCI PM -D0-D3 states (although the difference between D3hot and D3cold is not taken -into account by ACPI). Moreover, for each power state of a device there is a -set of power resources that have to be enabled for the device to be put into -that state. These power resources are controlled (i.e. enabled or disabled) -with the help of their own control methods, _ON and _OFF, that have to be -defined individually for each of them. - -To put a device into the ACPI power state Dx (where x is a number between 0 and -3 inclusive) the kernel is supposed to (1) enable the power resources required -by the device in this state using their _ON control methods and (2) execute the -_PSx control method defined for the device. In addition to that, if the device -is going to be put into a low-power state (D1-D3) and is supposed to generate -wakeup signals from that state, the _DSW (or _PSW, replaced with _DSW by ACPI -3.0) control method defined for it has to be executed before _PSx. Power -resources that are not required by the device in the target power state and are -not required any more by any other device should be disabled (by executing their -_OFF control methods). If the current power state of the device is D3, it can -only be put into D0 this way. - -However, quite often the power states of devices are changed during a -system-wide transition into a sleep state or back into the working state. ACPI -defines four system sleep states, S1, S2, S3, and S4, and denotes the system -working state as S0. In general, the target system sleep (or working) state -determines the highest power (lowest number) state the device can be put -into and the kernel is supposed to obtain this information by executing the -device's _SxD control method (where x is a number between 0 and 4 inclusive). -If the device is required to wake up the system from the target sleep state, the -lowest power (highest number) state it can be put into is also determined by the -target state of the system. The kernel is then supposed to use the device's -_SxW control method to obtain the number of that state. It also is supposed to -use the device's _PRW control method to learn which power resources need to be -enabled for the device to be able to generate wakeup signals. - -1.4. Wakeup Signaling ---------------------- -Wakeup signals generated by PCI devices, either as native PCI PMEs, or as -a result of the execution of the _DSW (or _PSW) ACPI control method before -putting the device into a low-power state, have to be caught and handled as -appropriate. If they are sent while the system is in the working state -(ACPI S0), they should be translated into interrupts so that the kernel can -put the devices generating them into the full-power state and take care of the -events that triggered them. In turn, if they are sent while the system is -sleeping, they should cause the system's core logic to trigger wakeup. - -On ACPI-based systems wakeup signals sent by conventional PCI devices are -converted into ACPI General-Purpose Events (GPEs) which are hardware signals -from the system core logic generated in response to various events that need to -be acted upon. Every GPE is associated with one or more sources of potentially -interesting events. In particular, a GPE may be associated with a PCI device -capable of signaling wakeup. The information on the connections between GPEs -and event sources is recorded in the system's ACPI BIOS from where it can be -read by the kernel. - -If a PCI device known to the system's ACPI BIOS signals wakeup, the GPE -associated with it (if there is one) is triggered. The GPEs associated with PCI -bridges may also be triggered in response to a wakeup signal from one of the -devices below the bridge (this also is the case for root bridges) and, for -example, native PCI PMEs from devices unknown to the system's ACPI BIOS may be -handled this way. - -A GPE may be triggered when the system is sleeping (i.e. when it is in one of -the ACPI S1-S4 states), in which case system wakeup is started by its core logic -(the device that was the source of the signal causing the system wakeup to occur -may be identified later). The GPEs used in such situations are referred to as -wakeup GPEs. - -Usually, however, GPEs are also triggered when the system is in the working -state (ACPI S0) and in that case the system's core logic generates a System -Control Interrupt (SCI) to notify the kernel of the event. Then, the SCI -handler identifies the GPE that caused the interrupt to be generated which, -in turn, allows the kernel to identify the source of the event (that may be -a PCI device signaling wakeup). The GPEs used for notifying the kernel of -events occurring while the system is in the working state are referred to as -runtime GPEs. - -Unfortunately, there is no standard way of handling wakeup signals sent by -conventional PCI devices on systems that are not ACPI-based, but there is one -for PCI Express devices. Namely, the PCI Express Base Specification introduced -a native mechanism for converting native PCI PMEs into interrupts generated by -root ports. For conventional PCI devices native PMEs are out-of-band, so they -are routed separately and they need not pass through bridges (in principle they -may be routed directly to the system's core logic), but for PCI Express devices -they are in-band messages that have to pass through the PCI Express hierarchy, -including the root port on the path from the device to the Root Complex. Thus -it was possible to introduce a mechanism by which a root port generates an -interrupt whenever it receives a PME message from one of the devices below it. -The PCI Express Requester ID of the device that sent the PME message is then -recorded in one of the root port's configuration registers from where it may be -read by the interrupt handler allowing the device to be identified. [PME -messages sent by PCI Express endpoints integrated with the Root Complex don't -pass through root ports, but instead they cause a Root Complex Event Collector -(if there is one) to generate interrupts.] - -In principle the native PCI Express PME signaling may also be used on ACPI-based -systems along with the GPEs, but to use it the kernel has to ask the system's -ACPI BIOS to release control of root port configuration registers. The ACPI -BIOS, however, is not required to allow the kernel to control these registers -and if it doesn't do that, the kernel must not modify their contents. Of course -the native PCI Express PME signaling cannot be used by the kernel in that case. - - -2. PCI Subsystem and Device Power Management -============================================ - -2.1. Device Power Management Callbacks --------------------------------------- -The PCI Subsystem participates in the power management of PCI devices in a -number of ways. First of all, it provides an intermediate code layer between -the device power management core (PM core) and PCI device drivers. -Specifically, the pm field of the PCI subsystem's struct bus_type object, -pci_bus_type, points to a struct dev_pm_ops object, pci_dev_pm_ops, containing -pointers to several device power management callbacks: - -const struct dev_pm_ops pci_dev_pm_ops = { - .prepare = pci_pm_prepare, - .complete = pci_pm_complete, - .suspend = pci_pm_suspend, - .resume = pci_pm_resume, - .freeze = pci_pm_freeze, - .thaw = pci_pm_thaw, - .poweroff = pci_pm_poweroff, - .restore = pci_pm_restore, - .suspend_noirq = pci_pm_suspend_noirq, - .resume_noirq = pci_pm_resume_noirq, - .freeze_noirq = pci_pm_freeze_noirq, - .thaw_noirq = pci_pm_thaw_noirq, - .poweroff_noirq = pci_pm_poweroff_noirq, - .restore_noirq = pci_pm_restore_noirq, - .runtime_suspend = pci_pm_runtime_suspend, - .runtime_resume = pci_pm_runtime_resume, - .runtime_idle = pci_pm_runtime_idle, -}; - -These callbacks are executed by the PM core in various situations related to -device power management and they, in turn, execute power management callbacks -provided by PCI device drivers. They also perform power management operations -involving some standard configuration registers of PCI devices that device -drivers need not know or care about. - -The structure representing a PCI device, struct pci_dev, contains several fields -that these callbacks operate on: - -struct pci_dev { - ... - pci_power_t current_state; /* Current operating state. */ - int pm_cap; /* PM capability offset in the - configuration space */ - unsigned int pme_support:5; /* Bitmask of states from which PME# - can be generated */ - unsigned int pme_interrupt:1;/* Is native PCIe PME signaling used? */ - unsigned int d1_support:1; /* Low power state D1 is supported */ - unsigned int d2_support:1; /* Low power state D2 is supported */ - unsigned int no_d1d2:1; /* D1 and D2 are forbidden */ - unsigned int wakeup_prepared:1; /* Device prepared for wake up */ - unsigned int d3_delay; /* D3->D0 transition time in ms */ - ... -}; - -They also indirectly use some fields of the struct device that is embedded in -struct pci_dev. - -2.2. Device Initialization --------------------------- -The PCI subsystem's first task related to device power management is to -prepare the device for power management and initialize the fields of struct -pci_dev used for this purpose. This happens in two functions defined in -drivers/pci/pci.c, pci_pm_init() and platform_pci_wakeup_init(). - -The first of these functions checks if the device supports native PCI PM -and if that's the case the offset of its power management capability structure -in the configuration space is stored in the pm_cap field of the device's struct -pci_dev object. Next, the function checks which PCI low-power states are -supported by the device and from which low-power states the device can generate -native PCI PMEs. The power management fields of the device's struct pci_dev and -the struct device embedded in it are updated accordingly and the generation of -PMEs by the device is disabled. - -The second function checks if the device can be prepared to signal wakeup with -the help of the platform firmware, such as the ACPI BIOS. If that is the case, -the function updates the wakeup fields in struct device embedded in the -device's struct pci_dev and uses the firmware-provided method to prevent the -device from signaling wakeup. - -At this point the device is ready for power management. For driverless devices, -however, this functionality is limited to a few basic operations carried out -during system-wide transitions to a sleep state and back to the working state. - -2.3. Runtime Device Power Management ------------------------------------- -The PCI subsystem plays a vital role in the runtime power management of PCI -devices. For this purpose it uses the general runtime power management -(runtime PM) framework described in Documentation/power/runtime_pm.txt. -Namely, it provides subsystem-level callbacks: - - pci_pm_runtime_suspend() - pci_pm_runtime_resume() - pci_pm_runtime_idle() - -that are executed by the core runtime PM routines. It also implements the -entire mechanics necessary for handling runtime wakeup signals from PCI devices -in low-power states, which at the time of this writing works for both the native -PCI Express PME signaling and the ACPI GPE-based wakeup signaling described in -Section 1. - -First, a PCI device is put into a low-power state, or suspended, with the help -of pm_schedule_suspend() or pm_runtime_suspend() which for PCI devices call -pci_pm_runtime_suspend() to do the actual job. For this to work, the device's -driver has to provide a pm->runtime_suspend() callback (see below), which is -run by pci_pm_runtime_suspend() as the first action. If the driver's callback -returns successfully, the device's standard configuration registers are saved, -the device is prepared to generate wakeup signals and, finally, it is put into -the target low-power state. - -The low-power state to put the device into is the lowest-power (highest number) -state from which it can signal wakeup. The exact method of signaling wakeup is -system-dependent and is determined by the PCI subsystem on the basis of the -reported capabilities of the device and the platform firmware. To prepare the -device for signaling wakeup and put it into the selected low-power state, the -PCI subsystem can use the platform firmware as well as the device's native PCI -PM capabilities, if supported. - -It is expected that the device driver's pm->runtime_suspend() callback will -not attempt to prepare the device for signaling wakeup or to put it into a -low-power state. The driver ought to leave these tasks to the PCI subsystem -that has all of the information necessary to perform them. - -A suspended device is brought back into the "active" state, or resumed, -with the help of pm_request_resume() or pm_runtime_resume() which both call -pci_pm_runtime_resume() for PCI devices. Again, this only works if the device's -driver provides a pm->runtime_resume() callback (see below). However, before -the driver's callback is executed, pci_pm_runtime_resume() brings the device -back into the full-power state, prevents it from signaling wakeup while in that -state and restores its standard configuration registers. Thus the driver's -callback need not worry about the PCI-specific aspects of the device resume. - -Note that generally pci_pm_runtime_resume() may be called in two different -situations. First, it may be called at the request of the device's driver, for -example if there are some data for it to process. Second, it may be called -as a result of a wakeup signal from the device itself (this sometimes is -referred to as "remote wakeup"). Of course, for this purpose the wakeup signal -is handled in one of the ways described in Section 1 and finally converted into -a notification for the PCI subsystem after the source device has been -identified. - -The pci_pm_runtime_idle() function, called for PCI devices by pm_runtime_idle() -and pm_request_idle(), executes the device driver's pm->runtime_idle() -callback, if defined, and if that callback doesn't return error code (or is not -present at all), suspends the device with the help of pm_runtime_suspend(). -Sometimes pci_pm_runtime_idle() is called automatically by the PM core (for -example, it is called right after the device has just been resumed), in which -cases it is expected to suspend the device if that makes sense. Usually, -however, the PCI subsystem doesn't really know if the device really can be -suspended, so it lets the device's driver decide by running its -pm->runtime_idle() callback. - -2.4. System-Wide Power Transitions ----------------------------------- -There are a few different types of system-wide power transitions, described in -Documentation/driver-api/pm/devices.rst. Each of them requires devices to be handled -in a specific way and the PM core executes subsystem-level power management -callbacks for this purpose. They are executed in phases such that each phase -involves executing the same subsystem-level callback for every device belonging -to the given subsystem before the next phase begins. These phases always run -after tasks have been frozen. - -2.4.1. System Suspend - -When the system is going into a sleep state in which the contents of memory will -be preserved, such as one of the ACPI sleep states S1-S3, the phases are: - - prepare, suspend, suspend_noirq. - -The following PCI bus type's callbacks, respectively, are used in these phases: - - pci_pm_prepare() - pci_pm_suspend() - pci_pm_suspend_noirq() - -The pci_pm_prepare() routine first puts the device into the "fully functional" -state with the help of pm_runtime_resume(). Then, it executes the device -driver's pm->prepare() callback if defined (i.e. if the driver's struct -dev_pm_ops object is present and the prepare pointer in that object is valid). - -The pci_pm_suspend() routine first checks if the device's driver implements -legacy PCI suspend routines (see Section 3), in which case the driver's legacy -suspend callback is executed, if present, and its result is returned. Next, if -the device's driver doesn't provide a struct dev_pm_ops object (containing -pointers to the driver's callbacks), pci_pm_default_suspend() is called, which -simply turns off the device's bus master capability and runs -pcibios_disable_device() to disable it, unless the device is a bridge (PCI -bridges are ignored by this routine). Next, the device driver's pm->suspend() -callback is executed, if defined, and its result is returned if it fails. -Finally, pci_fixup_device() is called to apply hardware suspend quirks related -to the device if necessary. - -Note that the suspend phase is carried out asynchronously for PCI devices, so -the pci_pm_suspend() callback may be executed in parallel for any pair of PCI -devices that don't depend on each other in a known way (i.e. none of the paths -in the device tree from the root bridge to a leaf device contains both of them). - -The pci_pm_suspend_noirq() routine is executed after suspend_device_irqs() has -been called, which means that the device driver's interrupt handler won't be -invoked while this routine is running. It first checks if the device's driver -implements legacy PCI suspends routines (Section 3), in which case the legacy -late suspend routine is called and its result is returned (the standard -configuration registers of the device are saved if the driver's callback hasn't -done that). Second, if the device driver's struct dev_pm_ops object is not -present, the device's standard configuration registers are saved and the routine -returns success. Otherwise the device driver's pm->suspend_noirq() callback is -executed, if present, and its result is returned if it fails. Next, if the -device's standard configuration registers haven't been saved yet (one of the -device driver's callbacks executed before might do that), pci_pm_suspend_noirq() -saves them, prepares the device to signal wakeup (if necessary) and puts it into -a low-power state. - -The low-power state to put the device into is the lowest-power (highest number) -state from which it can signal wakeup while the system is in the target sleep -state. Just like in the runtime PM case described above, the mechanism of -signaling wakeup is system-dependent and determined by the PCI subsystem, which -is also responsible for preparing the device to signal wakeup from the system's -target sleep state as appropriate. - -PCI device drivers (that don't implement legacy power management callbacks) are -generally not expected to prepare devices for signaling wakeup or to put them -into low-power states. However, if one of the driver's suspend callbacks -(pm->suspend() or pm->suspend_noirq()) saves the device's standard configuration -registers, pci_pm_suspend_noirq() will assume that the device has been prepared -to signal wakeup and put into a low-power state by the driver (the driver is -then assumed to have used the helper functions provided by the PCI subsystem for -this purpose). PCI device drivers are not encouraged to do that, but in some -rare cases doing that in the driver may be the optimum approach. - -2.4.2. System Resume - -When the system is undergoing a transition from a sleep state in which the -contents of memory have been preserved, such as one of the ACPI sleep states -S1-S3, into the working state (ACPI S0), the phases are: - - resume_noirq, resume, complete. - -The following PCI bus type's callbacks, respectively, are executed in these -phases: - - pci_pm_resume_noirq() - pci_pm_resume() - pci_pm_complete() - -The pci_pm_resume_noirq() routine first puts the device into the full-power -state, restores its standard configuration registers and applies early resume -hardware quirks related to the device, if necessary. This is done -unconditionally, regardless of whether or not the device's driver implements -legacy PCI power management callbacks (this way all PCI devices are in the -full-power state and their standard configuration registers have been restored -when their interrupt handlers are invoked for the first time during resume, -which allows the kernel to avoid problems with the handling of shared interrupts -by drivers whose devices are still suspended). If legacy PCI power management -callbacks (see Section 3) are implemented by the device's driver, the legacy -early resume callback is executed and its result is returned. Otherwise, the -device driver's pm->resume_noirq() callback is executed, if defined, and its -result is returned. - -The pci_pm_resume() routine first checks if the device's standard configuration -registers have been restored and restores them if that's not the case (this -only is necessary in the error path during a failing suspend). Next, resume -hardware quirks related to the device are applied, if necessary, and if the -device's driver implements legacy PCI power management callbacks (see -Section 3), the driver's legacy resume callback is executed and its result is -returned. Otherwise, the device's wakeup signaling mechanisms are blocked and -its driver's pm->resume() callback is executed, if defined (the callback's -result is then returned). - -The resume phase is carried out asynchronously for PCI devices, like the -suspend phase described above, which means that if two PCI devices don't depend -on each other in a known way, the pci_pm_resume() routine may be executed for -the both of them in parallel. - -The pci_pm_complete() routine only executes the device driver's pm->complete() -callback, if defined. - -2.4.3. System Hibernation - -System hibernation is more complicated than system suspend, because it requires -a system image to be created and written into a persistent storage medium. The -image is created atomically and all devices are quiesced, or frozen, before that -happens. - -The freezing of devices is carried out after enough memory has been freed (at -the time of this writing the image creation requires at least 50% of system RAM -to be free) in the following three phases: - - prepare, freeze, freeze_noirq - -that correspond to the PCI bus type's callbacks: - - pci_pm_prepare() - pci_pm_freeze() - pci_pm_freeze_noirq() - -This means that the prepare phase is exactly the same as for system suspend. -The other two phases, however, are different. - -The pci_pm_freeze() routine is quite similar to pci_pm_suspend(), but it runs -the device driver's pm->freeze() callback, if defined, instead of pm->suspend(), -and it doesn't apply the suspend-related hardware quirks. It is executed -asynchronously for different PCI devices that don't depend on each other in a -known way. - -The pci_pm_freeze_noirq() routine, in turn, is similar to -pci_pm_suspend_noirq(), but it calls the device driver's pm->freeze_noirq() -routine instead of pm->suspend_noirq(). It also doesn't attempt to prepare the -device for signaling wakeup and put it into a low-power state. Still, it saves -the device's standard configuration registers if they haven't been saved by one -of the driver's callbacks. - -Once the image has been created, it has to be saved. However, at this point all -devices are frozen and they cannot handle I/O, while their ability to handle -I/O is obviously necessary for the image saving. Thus they have to be brought -back to the fully functional state and this is done in the following phases: - - thaw_noirq, thaw, complete - -using the following PCI bus type's callbacks: - - pci_pm_thaw_noirq() - pci_pm_thaw() - pci_pm_complete() - -respectively. - -The first of them, pci_pm_thaw_noirq(), is analogous to pci_pm_resume_noirq(), -but it doesn't put the device into the full power state and doesn't attempt to -restore its standard configuration registers. It also executes the device -driver's pm->thaw_noirq() callback, if defined, instead of pm->resume_noirq(). - -The pci_pm_thaw() routine is similar to pci_pm_resume(), but it runs the device -driver's pm->thaw() callback instead of pm->resume(). It is executed -asynchronously for different PCI devices that don't depend on each other in a -known way. - -The complete phase it the same as for system resume. - -After saving the image, devices need to be powered down before the system can -enter the target sleep state (ACPI S4 for ACPI-based systems). This is done in -three phases: - - prepare, poweroff, poweroff_noirq - -where the prepare phase is exactly the same as for system suspend. The other -two phases are analogous to the suspend and suspend_noirq phases, respectively. -The PCI subsystem-level callbacks they correspond to - - pci_pm_poweroff() - pci_pm_poweroff_noirq() - -work in analogy with pci_pm_suspend() and pci_pm_poweroff_noirq(), respectively, -although they don't attempt to save the device's standard configuration -registers. - -2.4.4. System Restore - -System restore requires a hibernation image to be loaded into memory and the -pre-hibernation memory contents to be restored before the pre-hibernation system -activity can be resumed. - -As described in Documentation/driver-api/pm/devices.rst, the hibernation image is loaded -into memory by a fresh instance of the kernel, called the boot kernel, which in -turn is loaded and run by a boot loader in the usual way. After the boot kernel -has loaded the image, it needs to replace its own code and data with the code -and data of the "hibernated" kernel stored within the image, called the image -kernel. For this purpose all devices are frozen just like before creating -the image during hibernation, in the - - prepare, freeze, freeze_noirq - -phases described above. However, the devices affected by these phases are only -those having drivers in the boot kernel; other devices will still be in whatever -state the boot loader left them. - -Should the restoration of the pre-hibernation memory contents fail, the boot -kernel would go through the "thawing" procedure described above, using the -thaw_noirq, thaw, and complete phases (that will only affect the devices having -drivers in the boot kernel), and then continue running normally. - -If the pre-hibernation memory contents are restored successfully, which is the -usual situation, control is passed to the image kernel, which then becomes -responsible for bringing the system back to the working state. To achieve this, -it must restore the devices' pre-hibernation functionality, which is done much -like waking up from the memory sleep state, although it involves different -phases: - - restore_noirq, restore, complete - -The first two of these are analogous to the resume_noirq and resume phases -described above, respectively, and correspond to the following PCI subsystem -callbacks: - - pci_pm_restore_noirq() - pci_pm_restore() - -These callbacks work in analogy with pci_pm_resume_noirq() and pci_pm_resume(), -respectively, but they execute the device driver's pm->restore_noirq() and -pm->restore() callbacks, if available. - -The complete phase is carried out in exactly the same way as during system -resume. - - -3. PCI Device Drivers and Power Management -========================================== - -3.1. Power Management Callbacks -------------------------------- -PCI device drivers participate in power management by providing callbacks to be -executed by the PCI subsystem's power management routines described above and by -controlling the runtime power management of their devices. - -At the time of this writing there are two ways to define power management -callbacks for a PCI device driver, the recommended one, based on using a -dev_pm_ops structure described in Documentation/driver-api/pm/devices.rst, and the -"legacy" one, in which the .suspend(), .suspend_late(), .resume_early(), and -.resume() callbacks from struct pci_driver are used. The legacy approach, -however, doesn't allow one to define runtime power management callbacks and is -not really suitable for any new drivers. Therefore it is not covered by this -document (refer to the source code to learn more about it). - -It is recommended that all PCI device drivers define a struct dev_pm_ops object -containing pointers to power management (PM) callbacks that will be executed by -the PCI subsystem's PM routines in various circumstances. A pointer to the -driver's struct dev_pm_ops object has to be assigned to the driver.pm field in -its struct pci_driver object. Once that has happened, the "legacy" PM callbacks -in struct pci_driver are ignored (even if they are not NULL). - -The PM callbacks in struct dev_pm_ops are not mandatory and if they are not -defined (i.e. the respective fields of struct dev_pm_ops are unset) the PCI -subsystem will handle the device in a simplified default manner. If they are -defined, though, they are expected to behave as described in the following -subsections. - -3.1.1. prepare() - -The prepare() callback is executed during system suspend, during hibernation -(when a hibernation image is about to be created), during power-off after -saving a hibernation image and during system restore, when a hibernation image -has just been loaded into memory. - -This callback is only necessary if the driver's device has children that in -general may be registered at any time. In that case the role of the prepare() -callback is to prevent new children of the device from being registered until -one of the resume_noirq(), thaw_noirq(), or restore_noirq() callbacks is run. - -In addition to that the prepare() callback may carry out some operations -preparing the device to be suspended, although it should not allocate memory -(if additional memory is required to suspend the device, it has to be -preallocated earlier, for example in a suspend/hibernate notifier as described -in Documentation/driver-api/pm/notifiers.rst). - -3.1.2. suspend() - -The suspend() callback is only executed during system suspend, after prepare() -callbacks have been executed for all devices in the system. - -This callback is expected to quiesce the device and prepare it to be put into a -low-power state by the PCI subsystem. It is not required (in fact it even is -not recommended) that a PCI driver's suspend() callback save the standard -configuration registers of the device, prepare it for waking up the system, or -put it into a low-power state. All of these operations can very well be taken -care of by the PCI subsystem, without the driver's participation. - -However, in some rare case it is convenient to carry out these operations in -a PCI driver. Then, pci_save_state(), pci_prepare_to_sleep(), and -pci_set_power_state() should be used to save the device's standard configuration -registers, to prepare it for system wakeup (if necessary), and to put it into a -low-power state, respectively. Moreover, if the driver calls pci_save_state(), -the PCI subsystem will not execute either pci_prepare_to_sleep(), or -pci_set_power_state() for its device, so the driver is then responsible for -handling the device as appropriate. - -While the suspend() callback is being executed, the driver's interrupt handler -can be invoked to handle an interrupt from the device, so all suspend-related -operations relying on the driver's ability to handle interrupts should be -carried out in this callback. - -3.1.3. suspend_noirq() - -The suspend_noirq() callback is only executed during system suspend, after -suspend() callbacks have been executed for all devices in the system and -after device interrupts have been disabled by the PM core. - -The difference between suspend_noirq() and suspend() is that the driver's -interrupt handler will not be invoked while suspend_noirq() is running. Thus -suspend_noirq() can carry out operations that would cause race conditions to -arise if they were performed in suspend(). - -3.1.4. freeze() - -The freeze() callback is hibernation-specific and is executed in two situations, -during hibernation, after prepare() callbacks have been executed for all devices -in preparation for the creation of a system image, and during restore, -after a system image has been loaded into memory from persistent storage and the -prepare() callbacks have been executed for all devices. - -The role of this callback is analogous to the role of the suspend() callback -described above. In fact, they only need to be different in the rare cases when -the driver takes the responsibility for putting the device into a low-power -state. - -In that cases the freeze() callback should not prepare the device system wakeup -or put it into a low-power state. Still, either it or freeze_noirq() should -save the device's standard configuration registers using pci_save_state(). - -3.1.5. freeze_noirq() - -The freeze_noirq() callback is hibernation-specific. It is executed during -hibernation, after prepare() and freeze() callbacks have been executed for all -devices in preparation for the creation of a system image, and during restore, -after a system image has been loaded into memory and after prepare() and -freeze() callbacks have been executed for all devices. It is always executed -after device interrupts have been disabled by the PM core. - -The role of this callback is analogous to the role of the suspend_noirq() -callback described above and it very rarely is necessary to define -freeze_noirq(). - -The difference between freeze_noirq() and freeze() is analogous to the -difference between suspend_noirq() and suspend(). - -3.1.6. poweroff() - -The poweroff() callback is hibernation-specific. It is executed when the system -is about to be powered off after saving a hibernation image to a persistent -storage. prepare() callbacks are executed for all devices before poweroff() is -called. - -The role of this callback is analogous to the role of the suspend() and freeze() -callbacks described above, although it does not need to save the contents of -the device's registers. In particular, if the driver wants to put the device -into a low-power state itself instead of allowing the PCI subsystem to do that, -the poweroff() callback should use pci_prepare_to_sleep() and -pci_set_power_state() to prepare the device for system wakeup and to put it -into a low-power state, respectively, but it need not save the device's standard -configuration registers. - -3.1.7. poweroff_noirq() - -The poweroff_noirq() callback is hibernation-specific. It is executed after -poweroff() callbacks have been executed for all devices in the system. - -The role of this callback is analogous to the role of the suspend_noirq() and -freeze_noirq() callbacks described above, but it does not need to save the -contents of the device's registers. - -The difference between poweroff_noirq() and poweroff() is analogous to the -difference between suspend_noirq() and suspend(). - -3.1.8. resume_noirq() - -The resume_noirq() callback is only executed during system resume, after the -PM core has enabled the non-boot CPUs. The driver's interrupt handler will not -be invoked while resume_noirq() is running, so this callback can carry out -operations that might race with the interrupt handler. - -Since the PCI subsystem unconditionally puts all devices into the full power -state in the resume_noirq phase of system resume and restores their standard -configuration registers, resume_noirq() is usually not necessary. In general -it should only be used for performing operations that would lead to race -conditions if carried out by resume(). - -3.1.9. resume() - -The resume() callback is only executed during system resume, after -resume_noirq() callbacks have been executed for all devices in the system and -device interrupts have been enabled by the PM core. - -This callback is responsible for restoring the pre-suspend configuration of the -device and bringing it back to the fully functional state. The device should be -able to process I/O in a usual way after resume() has returned. - -3.1.10. thaw_noirq() - -The thaw_noirq() callback is hibernation-specific. It is executed after a -system image has been created and the non-boot CPUs have been enabled by the PM -core, in the thaw_noirq phase of hibernation. It also may be executed if the -loading of a hibernation image fails during system restore (it is then executed -after enabling the non-boot CPUs). The driver's interrupt handler will not be -invoked while thaw_noirq() is running. - -The role of this callback is analogous to the role of resume_noirq(). The -difference between these two callbacks is that thaw_noirq() is executed after -freeze() and freeze_noirq(), so in general it does not need to modify the -contents of the device's registers. - -3.1.11. thaw() - -The thaw() callback is hibernation-specific. It is executed after thaw_noirq() -callbacks have been executed for all devices in the system and after device -interrupts have been enabled by the PM core. - -This callback is responsible for restoring the pre-freeze configuration of -the device, so that it will work in a usual way after thaw() has returned. - -3.1.12. restore_noirq() - -The restore_noirq() callback is hibernation-specific. It is executed in the -restore_noirq phase of hibernation, when the boot kernel has passed control to -the image kernel and the non-boot CPUs have been enabled by the image kernel's -PM core. - -This callback is analogous to resume_noirq() with the exception that it cannot -make any assumption on the previous state of the device, even if the BIOS (or -generally the platform firmware) is known to preserve that state over a -suspend-resume cycle. - -For the vast majority of PCI device drivers there is no difference between -resume_noirq() and restore_noirq(). - -3.1.13. restore() - -The restore() callback is hibernation-specific. It is executed after -restore_noirq() callbacks have been executed for all devices in the system and -after the PM core has enabled device drivers' interrupt handlers to be invoked. - -This callback is analogous to resume(), just like restore_noirq() is analogous -to resume_noirq(). Consequently, the difference between restore_noirq() and -restore() is analogous to the difference between resume_noirq() and resume(). - -For the vast majority of PCI device drivers there is no difference between -resume() and restore(). - -3.1.14. complete() - -The complete() callback is executed in the following situations: - - during system resume, after resume() callbacks have been executed for all - devices, - - during hibernation, before saving the system image, after thaw() callbacks - have been executed for all devices, - - during system restore, when the system is going back to its pre-hibernation - state, after restore() callbacks have been executed for all devices. -It also may be executed if the loading of a hibernation image into memory fails -(in that case it is run after thaw() callbacks have been executed for all -devices that have drivers in the boot kernel). - -This callback is entirely optional, although it may be necessary if the -prepare() callback performs operations that need to be reversed. - -3.1.15. runtime_suspend() - -The runtime_suspend() callback is specific to device runtime power management -(runtime PM). It is executed by the PM core's runtime PM framework when the -device is about to be suspended (i.e. quiesced and put into a low-power state) -at run time. - -This callback is responsible for freezing the device and preparing it to be -put into a low-power state, but it must allow the PCI subsystem to perform all -of the PCI-specific actions necessary for suspending the device. - -3.1.16. runtime_resume() - -The runtime_resume() callback is specific to device runtime PM. It is executed -by the PM core's runtime PM framework when the device is about to be resumed -(i.e. put into the full-power state and programmed to process I/O normally) at -run time. - -This callback is responsible for restoring the normal functionality of the -device after it has been put into the full-power state by the PCI subsystem. -The device is expected to be able to process I/O in the usual way after -runtime_resume() has returned. - -3.1.17. runtime_idle() - -The runtime_idle() callback is specific to device runtime PM. It is executed -by the PM core's runtime PM framework whenever it may be desirable to suspend -the device according to the PM core's information. In particular, it is -automatically executed right after runtime_resume() has returned in case the -resume of the device has happened as a result of a spurious event. - -This callback is optional, but if it is not implemented or if it returns 0, the -PCI subsystem will call pm_runtime_suspend() for the device, which in turn will -cause the driver's runtime_suspend() callback to be executed. - -3.1.18. Pointing Multiple Callback Pointers to One Routine - -Although in principle each of the callbacks described in the previous -subsections can be defined as a separate function, it often is convenient to -point two or more members of struct dev_pm_ops to the same routine. There are -a few convenience macros that can be used for this purpose. - -The SIMPLE_DEV_PM_OPS macro declares a struct dev_pm_ops object with one -suspend routine pointed to by the .suspend(), .freeze(), and .poweroff() -members and one resume routine pointed to by the .resume(), .thaw(), and -.restore() members. The other function pointers in this struct dev_pm_ops are -unset. - -The UNIVERSAL_DEV_PM_OPS macro is similar to SIMPLE_DEV_PM_OPS, but it -additionally sets the .runtime_resume() pointer to the same value as -.resume() (and .thaw(), and .restore()) and the .runtime_suspend() pointer to -the same value as .suspend() (and .freeze() and .poweroff()). - -The SET_SYSTEM_SLEEP_PM_OPS can be used inside of a declaration of struct -dev_pm_ops to indicate that one suspend routine is to be pointed to by the -.suspend(), .freeze(), and .poweroff() members and one resume routine is to -be pointed to by the .resume(), .thaw(), and .restore() members. - -3.1.19. Driver Flags for Power Management - -The PM core allows device drivers to set flags that influence the handling of -power management for the devices by the core itself and by middle layer code -including the PCI bus type. The flags should be set once at the driver probe -time with the help of the dev_pm_set_driver_flags() function and they should not -be updated directly afterwards. - -The DPM_FLAG_NEVER_SKIP flag prevents the PM core from using the direct-complete -mechanism allowing device suspend/resume callbacks to be skipped if the device -is in runtime suspend when the system suspend starts. That also affects all of -the ancestors of the device, so this flag should only be used if absolutely -necessary. - -The DPM_FLAG_SMART_PREPARE flag instructs the PCI bus type to only return a -positive value from pci_pm_prepare() if the ->prepare callback provided by the -driver of the device returns a positive value. That allows the driver to opt -out from using the direct-complete mechanism dynamically. - -The DPM_FLAG_SMART_SUSPEND flag tells the PCI bus type that from the driver's -perspective the device can be safely left in runtime suspend during system -suspend. That causes pci_pm_suspend(), pci_pm_freeze() and pci_pm_poweroff() -to skip resuming the device from runtime suspend unless there are PCI-specific -reasons for doing that. Also, it causes pci_pm_suspend_late/noirq(), -pci_pm_freeze_late/noirq() and pci_pm_poweroff_late/noirq() to return early -if the device remains in runtime suspend in the beginning of the "late" phase -of the system-wide transition under way. Moreover, if the device is in -runtime suspend in pci_pm_resume_noirq() or pci_pm_restore_noirq(), its runtime -power management status will be changed to "active" (as it is going to be put -into D0 going forward), but if it is in runtime suspend in pci_pm_thaw_noirq(), -the function will set the power.direct_complete flag for it (to make the PM core -skip the subsequent "thaw" callbacks for it) and return. - -Setting the DPM_FLAG_LEAVE_SUSPENDED flag means that the driver prefers the -device to be left in suspend after system-wide transitions to the working state. -This flag is checked by the PM core, but the PCI bus type informs the PM core -which devices may be left in suspend from its perspective (that happens during -the "noirq" phase of system-wide suspend and analogous transitions) and next it -uses the dev_pm_may_skip_resume() helper to decide whether or not to return from -pci_pm_resume_noirq() early, as the PM core will skip the remaining resume -callbacks for the device during the transition under way and will set its -runtime PM status to "suspended" if dev_pm_may_skip_resume() returns "true" for -it. - -3.2. Device Runtime Power Management ------------------------------------- -In addition to providing device power management callbacks PCI device drivers -are responsible for controlling the runtime power management (runtime PM) of -their devices. - -The PCI device runtime PM is optional, but it is recommended that PCI device -drivers implement it at least in the cases where there is a reliable way of -verifying that the device is not used (like when the network cable is detached -from an Ethernet adapter or there are no devices attached to a USB controller). - -To support the PCI runtime PM the driver first needs to implement the -runtime_suspend() and runtime_resume() callbacks. It also may need to implement -the runtime_idle() callback to prevent the device from being suspended again -every time right after the runtime_resume() callback has returned -(alternatively, the runtime_suspend() callback will have to check if the -device should really be suspended and return -EAGAIN if that is not the case). - -The runtime PM of PCI devices is enabled by default by the PCI core. PCI -device drivers do not need to enable it and should not attempt to do so. -However, it is blocked by pci_pm_init() that runs the pm_runtime_forbid() -helper function. In addition to that, the runtime PM usage counter of -each PCI device is incremented by local_pci_probe() before executing the -probe callback provided by the device's driver. - -If a PCI driver implements the runtime PM callbacks and intends to use the -runtime PM framework provided by the PM core and the PCI subsystem, it needs -to decrement the device's runtime PM usage counter in its probe callback -function. If it doesn't do that, the counter will always be different from -zero for the device and it will never be runtime-suspended. The simplest -way to do that is by calling pm_runtime_put_noidle(), but if the driver -wants to schedule an autosuspend right away, for example, it may call -pm_runtime_put_autosuspend() instead for this purpose. Generally, it -just needs to call a function that decrements the devices usage counter -from its probe routine to make runtime PM work for the device. - -It is important to remember that the driver's runtime_suspend() callback -may be executed right after the usage counter has been decremented, because -user space may already have caused the pm_runtime_allow() helper function -unblocking the runtime PM of the device to run via sysfs, so the driver must -be prepared to cope with that. - -The driver itself should not call pm_runtime_allow(), though. Instead, it -should let user space or some platform-specific code do that (user space can -do it via sysfs as stated above), but it must be prepared to handle the -runtime PM of the device correctly as soon as pm_runtime_allow() is called -(which may happen at any time, even before the driver is loaded). - -When the driver's remove callback runs, it has to balance the decrementation -of the device's runtime PM usage counter at the probe time. For this reason, -if it has decremented the counter in its probe callback, it must run -pm_runtime_get_noresume() in its remove callback. [Since the core carries -out a runtime resume of the device and bumps up the device's usage counter -before running the driver's remove callback, the runtime PM of the device -is effectively disabled for the duration of the remove execution and all -runtime PM helper functions incrementing the device's usage counter are -then effectively equivalent to pm_runtime_get_noresume().] - -The runtime PM framework works by processing requests to suspend or resume -devices, or to check if they are idle (in which cases it is reasonable to -subsequently request that they be suspended). These requests are represented -by work items put into the power management workqueue, pm_wq. Although there -are a few situations in which power management requests are automatically -queued by the PM core (for example, after processing a request to resume a -device the PM core automatically queues a request to check if the device is -idle), device drivers are generally responsible for queuing power management -requests for their devices. For this purpose they should use the runtime PM -helper functions provided by the PM core, discussed in -Documentation/power/runtime_pm.txt. - -Devices can also be suspended and resumed synchronously, without placing a -request into pm_wq. In the majority of cases this also is done by their -drivers that use helper functions provided by the PM core for this purpose. - -For more information on the runtime PM of devices refer to -Documentation/power/runtime_pm.txt. - - -4. Resources -============ - -PCI Local Bus Specification, Rev. 3.0 -PCI Bus Power Management Interface Specification, Rev. 1.2 -Advanced Configuration and Power Interface (ACPI) Specification, Rev. 3.0b -PCI Express Base Specification, Rev. 2.0 -Documentation/driver-api/pm/devices.rst -Documentation/power/runtime_pm.txt diff --git a/Documentation/power/pm_qos_interface.rst b/Documentation/power/pm_qos_interface.rst new file mode 100644 index 000000000000..945fc6d760c9 --- /dev/null +++ b/Documentation/power/pm_qos_interface.rst @@ -0,0 +1,225 @@ +=============================== +PM Quality Of Service Interface +=============================== + +This interface provides a kernel and user mode interface for registering +performance expectations by drivers, subsystems and user space applications on +one of the parameters. + +Two different PM QoS frameworks are available: +1. PM QoS classes for cpu_dma_latency, network_latency, network_throughput, +memory_bandwidth. +2. the per-device PM QoS framework provides the API to manage the per-device latency +constraints and PM QoS flags. + +Each parameters have defined units: + + * latency: usec + * timeout: usec + * throughput: kbs (kilo bit / sec) + * memory bandwidth: mbs (mega bit / sec) + + +1. PM QoS framework +=================== + +The infrastructure exposes multiple misc device nodes one per implemented +parameter. The set of parameters implement is defined by pm_qos_power_init() +and pm_qos_params.h. This is done because having the available parameters +being runtime configurable or changeable from a driver was seen as too easy to +abuse. + +For each parameter a list of performance requests is maintained along with +an aggregated target value. The aggregated target value is updated with +changes to the request list or elements of the list. Typically the +aggregated target value is simply the max or min of the request values held +in the parameter list elements. +Note: the aggregated target value is implemented as an atomic variable so that +reading the aggregated value does not require any locking mechanism. + + +From kernel mode the use of this interface is simple: + +void pm_qos_add_request(handle, param_class, target_value): + Will insert an element into the list for that identified PM QoS class with the + target value. Upon change to this list the new target is recomputed and any + registered notifiers are called only if the target value is now different. + Clients of pm_qos need to save the returned handle for future use in other + pm_qos API functions. + +void pm_qos_update_request(handle, new_target_value): + Will update the list element pointed to by the handle with the new target value + and recompute the new aggregated target, calling the notification tree if the + target is changed. + +void pm_qos_remove_request(handle): + Will remove the element. After removal it will update the aggregate target and + call the notification tree if the target was changed as a result of removing + the request. + +int pm_qos_request(param_class): + Returns the aggregated value for a given PM QoS class. + +int pm_qos_request_active(handle): + Returns if the request is still active, i.e. it has not been removed from a + PM QoS class constraints list. + +int pm_qos_add_notifier(param_class, notifier): + Adds a notification callback function to the PM QoS class. The callback is + called when the aggregated value for the PM QoS class is changed. + +int pm_qos_remove_notifier(int param_class, notifier): + Removes the notification callback function for the PM QoS class. + + +From user mode: + +Only processes can register a pm_qos request. To provide for automatic +cleanup of a process, the interface requires the process to register its +parameter requests in the following way: + +To register the default pm_qos target for the specific parameter, the process +must open one of /dev/[cpu_dma_latency, network_latency, network_throughput] + +As long as the device node is held open that process has a registered +request on the parameter. + +To change the requested target value the process needs to write an s32 value to +the open device node. Alternatively the user mode program could write a hex +string for the value using 10 char long format e.g. "0x12345678". This +translates to a pm_qos_update_request call. + +To remove the user mode request for a target value simply close the device +node. + + +2. PM QoS per-device latency and flags framework +================================================ + +For each device, there are three lists of PM QoS requests. Two of them are +maintained along with the aggregated targets of resume latency and active +state latency tolerance (in microseconds) and the third one is for PM QoS flags. +Values are updated in response to changes of the request list. + +The target values of resume latency and active state latency tolerance are +simply the minimum of the request values held in the parameter list elements. +The PM QoS flags aggregate value is a gather (bitwise OR) of all list elements' +values. One device PM QoS flag is defined currently: PM_QOS_FLAG_NO_POWER_OFF. + +Note: The aggregated target values are implemented in such a way that reading +the aggregated value does not require any locking mechanism. + + +From kernel mode the use of this interface is the following: + +int dev_pm_qos_add_request(device, handle, type, value): + Will insert an element into the list for that identified device with the + target value. Upon change to this list the new target is recomputed and any + registered notifiers are called only if the target value is now different. + Clients of dev_pm_qos need to save the handle for future use in other + dev_pm_qos API functions. + +int dev_pm_qos_update_request(handle, new_value): + Will update the list element pointed to by the handle with the new target + value and recompute the new aggregated target, calling the notification + trees if the target is changed. + +int dev_pm_qos_remove_request(handle): + Will remove the element. After removal it will update the aggregate target + and call the notification trees if the target was changed as a result of + removing the request. + +s32 dev_pm_qos_read_value(device): + Returns the aggregated value for a given device's constraints list. + +enum pm_qos_flags_status dev_pm_qos_flags(device, mask) + Check PM QoS flags of the given device against the given mask of flags. + The meaning of the return values is as follows: + + PM_QOS_FLAGS_ALL: + All flags from the mask are set + PM_QOS_FLAGS_SOME: + Some flags from the mask are set + PM_QOS_FLAGS_NONE: + No flags from the mask are set + PM_QOS_FLAGS_UNDEFINED: + The device's PM QoS structure has not been initialized + or the list of requests is empty. + +int dev_pm_qos_add_ancestor_request(dev, handle, type, value) + Add a PM QoS request for the first direct ancestor of the given device whose + power.ignore_children flag is unset (for DEV_PM_QOS_RESUME_LATENCY requests) + or whose power.set_latency_tolerance callback pointer is not NULL (for + DEV_PM_QOS_LATENCY_TOLERANCE requests). + +int dev_pm_qos_expose_latency_limit(device, value) + Add a request to the device's PM QoS list of resume latency constraints and + create a sysfs attribute pm_qos_resume_latency_us under the device's power + directory allowing user space to manipulate that request. + +void dev_pm_qos_hide_latency_limit(device) + Drop the request added by dev_pm_qos_expose_latency_limit() from the device's + PM QoS list of resume latency constraints and remove sysfs attribute + pm_qos_resume_latency_us from the device's power directory. + +int dev_pm_qos_expose_flags(device, value) + Add a request to the device's PM QoS list of flags and create sysfs attribute + pm_qos_no_power_off under the device's power directory allowing user space to + change the value of the PM_QOS_FLAG_NO_POWER_OFF flag. + +void dev_pm_qos_hide_flags(device) + Drop the request added by dev_pm_qos_expose_flags() from the device's PM QoS list + of flags and remove sysfs attribute pm_qos_no_power_off from the device's power + directory. + +Notification mechanisms: + +The per-device PM QoS framework has a per-device notification tree. + +int dev_pm_qos_add_notifier(device, notifier): + Adds a notification callback function for the device. + The callback is called when the aggregated value of the device constraints list + is changed (for resume latency device PM QoS only). + +int dev_pm_qos_remove_notifier(device, notifier): + Removes the notification callback function for the device. + + +Active state latency tolerance +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This device PM QoS type is used to support systems in which hardware may switch +to energy-saving operation modes on the fly. In those systems, if the operation +mode chosen by the hardware attempts to save energy in an overly aggressive way, +it may cause excess latencies to be visible to software, causing it to miss +certain protocol requirements or target frame or sample rates etc. + +If there is a latency tolerance control mechanism for a given device available +to software, the .set_latency_tolerance callback in that device's dev_pm_info +structure should be populated. The routine pointed to by it is should implement +whatever is necessary to transfer the effective requirement value to the +hardware. + +Whenever the effective latency tolerance changes for the device, its +.set_latency_tolerance() callback will be executed and the effective value will +be passed to it. If that value is negative, which means that the list of +latency tolerance requirements for the device is empty, the callback is expected +to switch the underlying hardware latency tolerance control mechanism to an +autonomous mode if available. If that value is PM_QOS_LATENCY_ANY, in turn, and +the hardware supports a special "no requirement" setting, the callback is +expected to use it. That allows software to prevent the hardware from +automatically updating the device's latency tolerance in response to its power +state changes (e.g. during transitions from D3cold to D0), which generally may +be done in the autonomous latency tolerance control mode. + +If .set_latency_tolerance() is present for the device, sysfs attribute +pm_qos_latency_tolerance_us will be present in the devivce's power directory. +Then, user space can use that attribute to specify its latency tolerance +requirement for the device, if any. Writing "any" to it means "no requirement, +but do not let the hardware control latency tolerance" and writing "auto" to it +allows the hardware to be switched to the autonomous mode if there are no other +requirements from the kernel side in the device's list. + +Kernel code can use the functions described above along with the +DEV_PM_QOS_LATENCY_TOLERANCE device PM QoS type to add, remove and update +latency tolerance requirements for devices. diff --git a/Documentation/power/pm_qos_interface.txt b/Documentation/power/pm_qos_interface.txt deleted file mode 100644 index 19c5f7b1a7ba..000000000000 --- a/Documentation/power/pm_qos_interface.txt +++ /dev/null @@ -1,212 +0,0 @@ -PM Quality Of Service Interface. - -This interface provides a kernel and user mode interface for registering -performance expectations by drivers, subsystems and user space applications on -one of the parameters. - -Two different PM QoS frameworks are available: -1. PM QoS classes for cpu_dma_latency, network_latency, network_throughput, -memory_bandwidth. -2. the per-device PM QoS framework provides the API to manage the per-device latency -constraints and PM QoS flags. - -Each parameters have defined units: - * latency: usec - * timeout: usec - * throughput: kbs (kilo bit / sec) - * memory bandwidth: mbs (mega bit / sec) - - -1. PM QoS framework - -The infrastructure exposes multiple misc device nodes one per implemented -parameter. The set of parameters implement is defined by pm_qos_power_init() -and pm_qos_params.h. This is done because having the available parameters -being runtime configurable or changeable from a driver was seen as too easy to -abuse. - -For each parameter a list of performance requests is maintained along with -an aggregated target value. The aggregated target value is updated with -changes to the request list or elements of the list. Typically the -aggregated target value is simply the max or min of the request values held -in the parameter list elements. -Note: the aggregated target value is implemented as an atomic variable so that -reading the aggregated value does not require any locking mechanism. - - -From kernel mode the use of this interface is simple: - -void pm_qos_add_request(handle, param_class, target_value): -Will insert an element into the list for that identified PM QoS class with the -target value. Upon change to this list the new target is recomputed and any -registered notifiers are called only if the target value is now different. -Clients of pm_qos need to save the returned handle for future use in other -pm_qos API functions. - -void pm_qos_update_request(handle, new_target_value): -Will update the list element pointed to by the handle with the new target value -and recompute the new aggregated target, calling the notification tree if the -target is changed. - -void pm_qos_remove_request(handle): -Will remove the element. After removal it will update the aggregate target and -call the notification tree if the target was changed as a result of removing -the request. - -int pm_qos_request(param_class): -Returns the aggregated value for a given PM QoS class. - -int pm_qos_request_active(handle): -Returns if the request is still active, i.e. it has not been removed from a -PM QoS class constraints list. - -int pm_qos_add_notifier(param_class, notifier): -Adds a notification callback function to the PM QoS class. The callback is -called when the aggregated value for the PM QoS class is changed. - -int pm_qos_remove_notifier(int param_class, notifier): -Removes the notification callback function for the PM QoS class. - - -From user mode: -Only processes can register a pm_qos request. To provide for automatic -cleanup of a process, the interface requires the process to register its -parameter requests in the following way: - -To register the default pm_qos target for the specific parameter, the process -must open one of /dev/[cpu_dma_latency, network_latency, network_throughput] - -As long as the device node is held open that process has a registered -request on the parameter. - -To change the requested target value the process needs to write an s32 value to -the open device node. Alternatively the user mode program could write a hex -string for the value using 10 char long format e.g. "0x12345678". This -translates to a pm_qos_update_request call. - -To remove the user mode request for a target value simply close the device -node. - - -2. PM QoS per-device latency and flags framework - -For each device, there are three lists of PM QoS requests. Two of them are -maintained along with the aggregated targets of resume latency and active -state latency tolerance (in microseconds) and the third one is for PM QoS flags. -Values are updated in response to changes of the request list. - -The target values of resume latency and active state latency tolerance are -simply the minimum of the request values held in the parameter list elements. -The PM QoS flags aggregate value is a gather (bitwise OR) of all list elements' -values. One device PM QoS flag is defined currently: PM_QOS_FLAG_NO_POWER_OFF. - -Note: The aggregated target values are implemented in such a way that reading -the aggregated value does not require any locking mechanism. - - -From kernel mode the use of this interface is the following: - -int dev_pm_qos_add_request(device, handle, type, value): -Will insert an element into the list for that identified device with the -target value. Upon change to this list the new target is recomputed and any -registered notifiers are called only if the target value is now different. -Clients of dev_pm_qos need to save the handle for future use in other -dev_pm_qos API functions. - -int dev_pm_qos_update_request(handle, new_value): -Will update the list element pointed to by the handle with the new target value -and recompute the new aggregated target, calling the notification trees if the -target is changed. - -int dev_pm_qos_remove_request(handle): -Will remove the element. After removal it will update the aggregate target and -call the notification trees if the target was changed as a result of removing -the request. - -s32 dev_pm_qos_read_value(device): -Returns the aggregated value for a given device's constraints list. - -enum pm_qos_flags_status dev_pm_qos_flags(device, mask) -Check PM QoS flags of the given device against the given mask of flags. -The meaning of the return values is as follows: - PM_QOS_FLAGS_ALL: All flags from the mask are set - PM_QOS_FLAGS_SOME: Some flags from the mask are set - PM_QOS_FLAGS_NONE: No flags from the mask are set - PM_QOS_FLAGS_UNDEFINED: The device's PM QoS structure has not been - initialized or the list of requests is empty. - -int dev_pm_qos_add_ancestor_request(dev, handle, type, value) -Add a PM QoS request for the first direct ancestor of the given device whose -power.ignore_children flag is unset (for DEV_PM_QOS_RESUME_LATENCY requests) -or whose power.set_latency_tolerance callback pointer is not NULL (for -DEV_PM_QOS_LATENCY_TOLERANCE requests). - -int dev_pm_qos_expose_latency_limit(device, value) -Add a request to the device's PM QoS list of resume latency constraints and -create a sysfs attribute pm_qos_resume_latency_us under the device's power -directory allowing user space to manipulate that request. - -void dev_pm_qos_hide_latency_limit(device) -Drop the request added by dev_pm_qos_expose_latency_limit() from the device's -PM QoS list of resume latency constraints and remove sysfs attribute -pm_qos_resume_latency_us from the device's power directory. - -int dev_pm_qos_expose_flags(device, value) -Add a request to the device's PM QoS list of flags and create sysfs attribute -pm_qos_no_power_off under the device's power directory allowing user space to -change the value of the PM_QOS_FLAG_NO_POWER_OFF flag. - -void dev_pm_qos_hide_flags(device) -Drop the request added by dev_pm_qos_expose_flags() from the device's PM QoS list -of flags and remove sysfs attribute pm_qos_no_power_off from the device's power -directory. - -Notification mechanisms: -The per-device PM QoS framework has a per-device notification tree. - -int dev_pm_qos_add_notifier(device, notifier): -Adds a notification callback function for the device. -The callback is called when the aggregated value of the device constraints list -is changed (for resume latency device PM QoS only). - -int dev_pm_qos_remove_notifier(device, notifier): -Removes the notification callback function for the device. - - -Active state latency tolerance - -This device PM QoS type is used to support systems in which hardware may switch -to energy-saving operation modes on the fly. In those systems, if the operation -mode chosen by the hardware attempts to save energy in an overly aggressive way, -it may cause excess latencies to be visible to software, causing it to miss -certain protocol requirements or target frame or sample rates etc. - -If there is a latency tolerance control mechanism for a given device available -to software, the .set_latency_tolerance callback in that device's dev_pm_info -structure should be populated. The routine pointed to by it is should implement -whatever is necessary to transfer the effective requirement value to the -hardware. - -Whenever the effective latency tolerance changes for the device, its -.set_latency_tolerance() callback will be executed and the effective value will -be passed to it. If that value is negative, which means that the list of -latency tolerance requirements for the device is empty, the callback is expected -to switch the underlying hardware latency tolerance control mechanism to an -autonomous mode if available. If that value is PM_QOS_LATENCY_ANY, in turn, and -the hardware supports a special "no requirement" setting, the callback is -expected to use it. That allows software to prevent the hardware from -automatically updating the device's latency tolerance in response to its power -state changes (e.g. during transitions from D3cold to D0), which generally may -be done in the autonomous latency tolerance control mode. - -If .set_latency_tolerance() is present for the device, sysfs attribute -pm_qos_latency_tolerance_us will be present in the devivce's power directory. -Then, user space can use that attribute to specify its latency tolerance -requirement for the device, if any. Writing "any" to it means "no requirement, -but do not let the hardware control latency tolerance" and writing "auto" to it -allows the hardware to be switched to the autonomous mode if there are no other -requirements from the kernel side in the device's list. - -Kernel code can use the functions described above along with the -DEV_PM_QOS_LATENCY_TOLERANCE device PM QoS type to add, remove and update -latency tolerance requirements for devices. diff --git a/Documentation/power/power_supply_class.rst b/Documentation/power/power_supply_class.rst new file mode 100644 index 000000000000..3f2c3fe38a61 --- /dev/null +++ b/Documentation/power/power_supply_class.rst @@ -0,0 +1,282 @@ +======================== +Linux power supply class +======================== + +Synopsis +~~~~~~~~ +Power supply class used to represent battery, UPS, AC or DC power supply +properties to user-space. + +It defines core set of attributes, which should be applicable to (almost) +every power supply out there. Attributes are available via sysfs and uevent +interfaces. + +Each attribute has well defined meaning, up to unit of measure used. While +the attributes provided are believed to be universally applicable to any +power supply, specific monitoring hardware may not be able to provide them +all, so any of them may be skipped. + +Power supply class is extensible, and allows to define drivers own attributes. +The core attribute set is subject to the standard Linux evolution (i.e. +if it will be found that some attribute is applicable to many power supply +types or their drivers, it can be added to the core set). + +It also integrates with LED framework, for the purpose of providing +typically expected feedback of battery charging/fully charged status and +AC/USB power supply online status. (Note that specific details of the +indication (including whether to use it at all) are fully controllable by +user and/or specific machine defaults, per design principles of LED +framework). + + +Attributes/properties +~~~~~~~~~~~~~~~~~~~~~ +Power supply class has predefined set of attributes, this eliminates code +duplication across drivers. Power supply class insist on reusing its +predefined attributes *and* their units. + +So, userspace gets predictable set of attributes and their units for any +kind of power supply, and can process/present them to a user in consistent +manner. Results for different power supplies and machines are also directly +comparable. + +See drivers/power/supply/ds2760_battery.c and drivers/power/supply/pda_power.c +for the example how to declare and handle attributes. + + +Units +~~~~~ +Quoting include/linux/power_supply.h: + + All voltages, currents, charges, energies, time and temperatures in µV, + µA, µAh, µWh, seconds and tenths of degree Celsius unless otherwise + stated. It's driver's job to convert its raw values to units in which + this class operates. + + +Attributes/properties detailed +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ++--------------------------------------------------------------------------+ +| **Charge/Energy/Capacity - how to not confuse** | ++--------------------------------------------------------------------------+ +| **Because both "charge" (µAh) and "energy" (µWh) represents "capacity" | +| of battery, this class distinguish these terms. Don't mix them!** | +| | +| - `CHARGE_*` | +| attributes represents capacity in µAh only. | +| - `ENERGY_*` | +| attributes represents capacity in µWh only. | +| - `CAPACITY` | +| attribute represents capacity in *percents*, from 0 to 100. | ++--------------------------------------------------------------------------+ + +Postfixes: + +_AVG + *hardware* averaged value, use it if your hardware is really able to + report averaged values. +_NOW + momentary/instantaneous values. + +STATUS + this attribute represents operating status (charging, full, + discharging (i.e. powering a load), etc.). This corresponds to + `BATTERY_STATUS_*` values, as defined in battery.h. + +CHARGE_TYPE + batteries can typically charge at different rates. + This defines trickle and fast charges. For batteries that + are already charged or discharging, 'n/a' can be displayed (or + 'unknown', if the status is not known). + +AUTHENTIC + indicates the power supply (battery or charger) connected + to the platform is authentic(1) or non authentic(0). + +HEALTH + represents health of the battery, values corresponds to + POWER_SUPPLY_HEALTH_*, defined in battery.h. + +VOLTAGE_OCV + open circuit voltage of the battery. + +VOLTAGE_MAX_DESIGN, VOLTAGE_MIN_DESIGN + design values for maximal and minimal power supply voltages. + Maximal/minimal means values of voltages when battery considered + "full"/"empty" at normal conditions. Yes, there is no direct relation + between voltage and battery capacity, but some dumb + batteries use voltage for very approximated calculation of capacity. + Battery driver also can use this attribute just to inform userspace + about maximal and minimal voltage thresholds of a given battery. + +VOLTAGE_MAX, VOLTAGE_MIN + same as _DESIGN voltage values except that these ones should be used + if hardware could only guess (measure and retain) the thresholds of a + given power supply. + +VOLTAGE_BOOT + Reports the voltage measured during boot + +CURRENT_BOOT + Reports the current measured during boot + +CHARGE_FULL_DESIGN, CHARGE_EMPTY_DESIGN + design charge values, when battery considered full/empty. + +ENERGY_FULL_DESIGN, ENERGY_EMPTY_DESIGN + same as above but for energy. + +CHARGE_FULL, CHARGE_EMPTY + These attributes means "last remembered value of charge when battery + became full/empty". It also could mean "value of charge when battery + considered full/empty at given conditions (temperature, age)". + I.e. these attributes represents real thresholds, not design values. + +ENERGY_FULL, ENERGY_EMPTY + same as above but for energy. + +CHARGE_COUNTER + the current charge counter (in µAh). This could easily + be negative; there is no empty or full value. It is only useful for + relative, time-based measurements. + +PRECHARGE_CURRENT + the maximum charge current during precharge phase of charge cycle + (typically 20% of battery capacity). + +CHARGE_TERM_CURRENT + Charge termination current. The charge cycle terminates when battery + voltage is above recharge threshold, and charge current is below + this setting (typically 10% of battery capacity). + +CONSTANT_CHARGE_CURRENT + constant charge current programmed by charger. + + +CONSTANT_CHARGE_CURRENT_MAX + maximum charge current supported by the power supply object. + +CONSTANT_CHARGE_VOLTAGE + constant charge voltage programmed by charger. +CONSTANT_CHARGE_VOLTAGE_MAX + maximum charge voltage supported by the power supply object. + +INPUT_CURRENT_LIMIT + input current limit programmed by charger. Indicates + the current drawn from a charging source. + +CHARGE_CONTROL_LIMIT + current charge control limit setting +CHARGE_CONTROL_LIMIT_MAX + maximum charge control limit setting + +CALIBRATE + battery or coulomb counter calibration status + +CAPACITY + capacity in percents. +CAPACITY_ALERT_MIN + minimum capacity alert value in percents. +CAPACITY_ALERT_MAX + maximum capacity alert value in percents. +CAPACITY_LEVEL + capacity level. This corresponds to POWER_SUPPLY_CAPACITY_LEVEL_*. + +TEMP + temperature of the power supply. +TEMP_ALERT_MIN + minimum battery temperature alert. +TEMP_ALERT_MAX + maximum battery temperature alert. +TEMP_AMBIENT + ambient temperature. +TEMP_AMBIENT_ALERT_MIN + minimum ambient temperature alert. +TEMP_AMBIENT_ALERT_MAX + maximum ambient temperature alert. +TEMP_MIN + minimum operatable temperature +TEMP_MAX + maximum operatable temperature + +TIME_TO_EMPTY + seconds left for battery to be considered empty + (i.e. while battery powers a load) +TIME_TO_FULL + seconds left for battery to be considered full + (i.e. while battery is charging) + + +Battery <-> external power supply interaction +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Often power supplies are acting as supplies and supplicants at the same +time. Batteries are good example. So, batteries usually care if they're +externally powered or not. + +For that case, power supply class implements notification mechanism for +batteries. + +External power supply (AC) lists supplicants (batteries) names in +"supplied_to" struct member, and each power_supply_changed() call +issued by external power supply will notify supplicants via +external_power_changed callback. + + +Devicetree battery characteristics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Drivers should call power_supply_get_battery_info() to obtain battery +characteristics from a devicetree battery node, defined in +Documentation/devicetree/bindings/power/supply/battery.txt. This is +implemented in drivers/power/supply/bq27xxx_battery.c. + +Properties in struct power_supply_battery_info and their counterparts in the +battery node have names corresponding to elements in enum power_supply_property, +for naming consistency between sysfs attributes and battery node properties. + + +QA +~~ + +Q: + Where is POWER_SUPPLY_PROP_XYZ attribute? +A: + If you cannot find attribute suitable for your driver needs, feel free + to add it and send patch along with your driver. + + The attributes available currently are the ones currently provided by the + drivers written. + + Good candidates to add in future: model/part#, cycle_time, manufacturer, + etc. + + +Q: + I have some very specific attribute (e.g. battery color), should I add + this attribute to standard ones? +A: + Most likely, no. Such attribute can be placed in the driver itself, if + it is useful. Of course, if the attribute in question applicable to + large set of batteries, provided by many drivers, and/or comes from + some general battery specification/standard, it may be a candidate to + be added to the core attribute set. + + +Q: + Suppose, my battery monitoring chip/firmware does not provides capacity + in percents, but provides charge_{now,full,empty}. Should I calculate + percentage capacity manually, inside the driver, and register CAPACITY + attribute? The same question about time_to_empty/time_to_full. +A: + Most likely, no. This class is designed to export properties which are + directly measurable by the specific hardware available. + + Inferring not available properties using some heuristics or mathematical + model is not subject of work for a battery driver. Such functionality + should be factored out, and in fact, apm_power, the driver to serve + legacy APM API on top of power supply class, uses a simple heuristic of + approximating remaining battery capacity based on its charge, current, + voltage and so on. But full-fledged battery model is likely not subject + for kernel at all, as it would require floating point calculation to deal + with things like differential equations and Kalman filters. This is + better be handled by batteryd/libbattery, yet to be written. diff --git a/Documentation/power/power_supply_class.txt b/Documentation/power/power_supply_class.txt deleted file mode 100644 index 300d37896e51..000000000000 --- a/Documentation/power/power_supply_class.txt +++ /dev/null @@ -1,231 +0,0 @@ -Linux power supply class -======================== - -Synopsis -~~~~~~~~ -Power supply class used to represent battery, UPS, AC or DC power supply -properties to user-space. - -It defines core set of attributes, which should be applicable to (almost) -every power supply out there. Attributes are available via sysfs and uevent -interfaces. - -Each attribute has well defined meaning, up to unit of measure used. While -the attributes provided are believed to be universally applicable to any -power supply, specific monitoring hardware may not be able to provide them -all, so any of them may be skipped. - -Power supply class is extensible, and allows to define drivers own attributes. -The core attribute set is subject to the standard Linux evolution (i.e. -if it will be found that some attribute is applicable to many power supply -types or their drivers, it can be added to the core set). - -It also integrates with LED framework, for the purpose of providing -typically expected feedback of battery charging/fully charged status and -AC/USB power supply online status. (Note that specific details of the -indication (including whether to use it at all) are fully controllable by -user and/or specific machine defaults, per design principles of LED -framework). - - -Attributes/properties -~~~~~~~~~~~~~~~~~~~~~ -Power supply class has predefined set of attributes, this eliminates code -duplication across drivers. Power supply class insist on reusing its -predefined attributes *and* their units. - -So, userspace gets predictable set of attributes and their units for any -kind of power supply, and can process/present them to a user in consistent -manner. Results for different power supplies and machines are also directly -comparable. - -See drivers/power/supply/ds2760_battery.c and drivers/power/supply/pda_power.c -for the example how to declare and handle attributes. - - -Units -~~~~~ -Quoting include/linux/power_supply.h: - - All voltages, currents, charges, energies, time and temperatures in µV, - µA, µAh, µWh, seconds and tenths of degree Celsius unless otherwise - stated. It's driver's job to convert its raw values to units in which - this class operates. - - -Attributes/properties detailed -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -~ ~ ~ ~ ~ ~ ~ Charge/Energy/Capacity - how to not confuse ~ ~ ~ ~ ~ ~ ~ -~ ~ -~ Because both "charge" (µAh) and "energy" (µWh) represents "capacity" ~ -~ of battery, this class distinguish these terms. Don't mix them! ~ -~ ~ -~ CHARGE_* attributes represents capacity in µAh only. ~ -~ ENERGY_* attributes represents capacity in µWh only. ~ -~ CAPACITY attribute represents capacity in *percents*, from 0 to 100. ~ -~ ~ -~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ - -Postfixes: -_AVG - *hardware* averaged value, use it if your hardware is really able to -report averaged values. -_NOW - momentary/instantaneous values. - -STATUS - this attribute represents operating status (charging, full, -discharging (i.e. powering a load), etc.). This corresponds to -BATTERY_STATUS_* values, as defined in battery.h. - -CHARGE_TYPE - batteries can typically charge at different rates. -This defines trickle and fast charges. For batteries that -are already charged or discharging, 'n/a' can be displayed (or -'unknown', if the status is not known). - -AUTHENTIC - indicates the power supply (battery or charger) connected -to the platform is authentic(1) or non authentic(0). - -HEALTH - represents health of the battery, values corresponds to -POWER_SUPPLY_HEALTH_*, defined in battery.h. - -VOLTAGE_OCV - open circuit voltage of the battery. - -VOLTAGE_MAX_DESIGN, VOLTAGE_MIN_DESIGN - design values for maximal and -minimal power supply voltages. Maximal/minimal means values of voltages -when battery considered "full"/"empty" at normal conditions. Yes, there is -no direct relation between voltage and battery capacity, but some dumb -batteries use voltage for very approximated calculation of capacity. -Battery driver also can use this attribute just to inform userspace -about maximal and minimal voltage thresholds of a given battery. - -VOLTAGE_MAX, VOLTAGE_MIN - same as _DESIGN voltage values except that -these ones should be used if hardware could only guess (measure and -retain) the thresholds of a given power supply. - -VOLTAGE_BOOT - Reports the voltage measured during boot - -CURRENT_BOOT - Reports the current measured during boot - -CHARGE_FULL_DESIGN, CHARGE_EMPTY_DESIGN - design charge values, when -battery considered full/empty. - -ENERGY_FULL_DESIGN, ENERGY_EMPTY_DESIGN - same as above but for energy. - -CHARGE_FULL, CHARGE_EMPTY - These attributes means "last remembered value -of charge when battery became full/empty". It also could mean "value of -charge when battery considered full/empty at given conditions (temperature, -age)". I.e. these attributes represents real thresholds, not design values. - -ENERGY_FULL, ENERGY_EMPTY - same as above but for energy. - -CHARGE_COUNTER - the current charge counter (in µAh). This could easily -be negative; there is no empty or full value. It is only useful for -relative, time-based measurements. - -PRECHARGE_CURRENT - the maximum charge current during precharge phase -of charge cycle (typically 20% of battery capacity). -CHARGE_TERM_CURRENT - Charge termination current. The charge cycle -terminates when battery voltage is above recharge threshold, and charge -current is below this setting (typically 10% of battery capacity). - -CONSTANT_CHARGE_CURRENT - constant charge current programmed by charger. -CONSTANT_CHARGE_CURRENT_MAX - maximum charge current supported by the -power supply object. - -CONSTANT_CHARGE_VOLTAGE - constant charge voltage programmed by charger. -CONSTANT_CHARGE_VOLTAGE_MAX - maximum charge voltage supported by the -power supply object. - -INPUT_CURRENT_LIMIT - input current limit programmed by charger. Indicates -the current drawn from a charging source. - -CHARGE_CONTROL_LIMIT - current charge control limit setting -CHARGE_CONTROL_LIMIT_MAX - maximum charge control limit setting - -CALIBRATE - battery or coulomb counter calibration status - -CAPACITY - capacity in percents. -CAPACITY_ALERT_MIN - minimum capacity alert value in percents. -CAPACITY_ALERT_MAX - maximum capacity alert value in percents. -CAPACITY_LEVEL - capacity level. This corresponds to -POWER_SUPPLY_CAPACITY_LEVEL_*. - -TEMP - temperature of the power supply. -TEMP_ALERT_MIN - minimum battery temperature alert. -TEMP_ALERT_MAX - maximum battery temperature alert. -TEMP_AMBIENT - ambient temperature. -TEMP_AMBIENT_ALERT_MIN - minimum ambient temperature alert. -TEMP_AMBIENT_ALERT_MAX - maximum ambient temperature alert. -TEMP_MIN - minimum operatable temperature -TEMP_MAX - maximum operatable temperature - -TIME_TO_EMPTY - seconds left for battery to be considered empty (i.e. -while battery powers a load) -TIME_TO_FULL - seconds left for battery to be considered full (i.e. -while battery is charging) - - -Battery <-> external power supply interaction -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Often power supplies are acting as supplies and supplicants at the same -time. Batteries are good example. So, batteries usually care if they're -externally powered or not. - -For that case, power supply class implements notification mechanism for -batteries. - -External power supply (AC) lists supplicants (batteries) names in -"supplied_to" struct member, and each power_supply_changed() call -issued by external power supply will notify supplicants via -external_power_changed callback. - - -Devicetree battery characteristics -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Drivers should call power_supply_get_battery_info() to obtain battery -characteristics from a devicetree battery node, defined in -Documentation/devicetree/bindings/power/supply/battery.txt. This is -implemented in drivers/power/supply/bq27xxx_battery.c. - -Properties in struct power_supply_battery_info and their counterparts in the -battery node have names corresponding to elements in enum power_supply_property, -for naming consistency between sysfs attributes and battery node properties. - - -QA -~~ -Q: Where is POWER_SUPPLY_PROP_XYZ attribute? -A: If you cannot find attribute suitable for your driver needs, feel free - to add it and send patch along with your driver. - - The attributes available currently are the ones currently provided by the - drivers written. - - Good candidates to add in future: model/part#, cycle_time, manufacturer, - etc. - - -Q: I have some very specific attribute (e.g. battery color), should I add - this attribute to standard ones? -A: Most likely, no. Such attribute can be placed in the driver itself, if - it is useful. Of course, if the attribute in question applicable to - large set of batteries, provided by many drivers, and/or comes from - some general battery specification/standard, it may be a candidate to - be added to the core attribute set. - - -Q: Suppose, my battery monitoring chip/firmware does not provides capacity - in percents, but provides charge_{now,full,empty}. Should I calculate - percentage capacity manually, inside the driver, and register CAPACITY - attribute? The same question about time_to_empty/time_to_full. -A: Most likely, no. This class is designed to export properties which are - directly measurable by the specific hardware available. - - Inferring not available properties using some heuristics or mathematical - model is not subject of work for a battery driver. Such functionality - should be factored out, and in fact, apm_power, the driver to serve - legacy APM API on top of power supply class, uses a simple heuristic of - approximating remaining battery capacity based on its charge, current, - voltage and so on. But full-fledged battery model is likely not subject - for kernel at all, as it would require floating point calculation to deal - with things like differential equations and Kalman filters. This is - better be handled by batteryd/libbattery, yet to be written. diff --git a/Documentation/power/powercap/powercap.rst b/Documentation/power/powercap/powercap.rst new file mode 100644 index 000000000000..7ae3b44c7624 --- /dev/null +++ b/Documentation/power/powercap/powercap.rst @@ -0,0 +1,257 @@ +======================= +Power Capping Framework +======================= + +The power capping framework provides a consistent interface between the kernel +and the user space that allows power capping drivers to expose the settings to +user space in a uniform way. + +Terminology +=========== + +The framework exposes power capping devices to user space via sysfs in the +form of a tree of objects. The objects at the root level of the tree represent +'control types', which correspond to different methods of power capping. For +example, the intel-rapl control type represents the Intel "Running Average +Power Limit" (RAPL) technology, whereas the 'idle-injection' control type +corresponds to the use of idle injection for controlling power. + +Power zones represent different parts of the system, which can be controlled and +monitored using the power capping method determined by the control type the +given zone belongs to. They each contain attributes for monitoring power, as +well as controls represented in the form of power constraints. If the parts of +the system represented by different power zones are hierarchical (that is, one +bigger part consists of multiple smaller parts that each have their own power +controls), those power zones may also be organized in a hierarchy with one +parent power zone containing multiple subzones and so on to reflect the power +control topology of the system. In that case, it is possible to apply power +capping to a set of devices together using the parent power zone and if more +fine grained control is required, it can be applied through the subzones. + + +Example sysfs interface tree:: + + /sys/devices/virtual/powercap + └──intel-rapl + ├──intel-rapl:0 + │   ├──constraint_0_name + │   ├──constraint_0_power_limit_uw + │   ├──constraint_0_time_window_us + │   ├──constraint_1_name + │   ├──constraint_1_power_limit_uw + │   ├──constraint_1_time_window_us + │   ├──device -> ../../intel-rapl + │   ├──energy_uj + │   ├──intel-rapl:0:0 + │   │   ├──constraint_0_name + │   │   ├──constraint_0_power_limit_uw + │   │   ├──constraint_0_time_window_us + │   │   ├──constraint_1_name + │   │   ├──constraint_1_power_limit_uw + │   │   ├──constraint_1_time_window_us + │   │   ├──device -> ../../intel-rapl:0 + │   │   ├──energy_uj + │   │   ├──max_energy_range_uj + │   │   ├──name + │   │   ├──enabled + │   │   ├──power + │   │   │   ├──async + │   │   │   [] + │   │   ├──subsystem -> ../../../../../../class/power_cap + │   │   └──uevent + │   ├──intel-rapl:0:1 + │   │   ├──constraint_0_name + │   │   ├──constraint_0_power_limit_uw + │   │   ├──constraint_0_time_window_us + │   │   ├──constraint_1_name + │   │   ├──constraint_1_power_limit_uw + │   │   ├──constraint_1_time_window_us + │   │   ├──device -> ../../intel-rapl:0 + │   │   ├──energy_uj + │   │   ├──max_energy_range_uj + │   │   ├──name + │   │   ├──enabled + │   │   ├──power + │   │   │   ├──async + │   │   │   [] + │   │   ├──subsystem -> ../../../../../../class/power_cap + │   │   └──uevent + │   ├──max_energy_range_uj + │   ├──max_power_range_uw + │   ├──name + │   ├──enabled + │   ├──power + │   │   ├──async + │   │   [] + │   ├──subsystem -> ../../../../../class/power_cap + │   ├──enabled + │   ├──uevent + ├──intel-rapl:1 + │   ├──constraint_0_name + │   ├──constraint_0_power_limit_uw + │   ├──constraint_0_time_window_us + │   ├──constraint_1_name + │   ├──constraint_1_power_limit_uw + │   ├──constraint_1_time_window_us + │   ├──device -> ../../intel-rapl + │   ├──energy_uj + │   ├──intel-rapl:1:0 + │   │   ├──constraint_0_name + │   │   ├──constraint_0_power_limit_uw + │   │   ├──constraint_0_time_window_us + │   │   ├──constraint_1_name + │   │   ├──constraint_1_power_limit_uw + │   │   ├──constraint_1_time_window_us + │   │   ├──device -> ../../intel-rapl:1 + │   │   ├──energy_uj + │   │   ├──max_energy_range_uj + │   │   ├──name + │   │   ├──enabled + │   │   ├──power + │   │   │   ├──async + │   │   │   [] + │   │   ├──subsystem -> ../../../../../../class/power_cap + │   │   └──uevent + │   ├──intel-rapl:1:1 + │   │   ├──constraint_0_name + │   │   ├──constraint_0_power_limit_uw + │   │   ├──constraint_0_time_window_us + │   │   ├──constraint_1_name + │   │   ├──constraint_1_power_limit_uw + │   │   ├──constraint_1_time_window_us + │   │   ├──device -> ../../intel-rapl:1 + │   │   ├──energy_uj + │   │   ├──max_energy_range_uj + │   │   ├──name + │   │   ├──enabled + │   │   ├──power + │   │   │   ├──async + │   │   │   [] + │   │   ├──subsystem -> ../../../../../../class/power_cap + │   │   └──uevent + │   ├──max_energy_range_uj + │   ├──max_power_range_uw + │   ├──name + │   ├──enabled + │   ├──power + │   │   ├──async + │   │   [] + │   ├──subsystem -> ../../../../../class/power_cap + │   ├──uevent + ├──power + │   ├──async + │   [] + ├──subsystem -> ../../../../class/power_cap + ├──enabled + └──uevent + +The above example illustrates a case in which the Intel RAPL technology, +available in Intel® IA-64 and IA-32 Processor Architectures, is used. There is one +control type called intel-rapl which contains two power zones, intel-rapl:0 and +intel-rapl:1, representing CPU packages. Each of these power zones contains +two subzones, intel-rapl:j:0 and intel-rapl:j:1 (j = 0, 1), representing the +"core" and the "uncore" parts of the given CPU package, respectively. All of +the zones and subzones contain energy monitoring attributes (energy_uj, +max_energy_range_uj) and constraint attributes (constraint_*) allowing controls +to be applied (the constraints in the 'package' power zones apply to the whole +CPU packages and the subzone constraints only apply to the respective parts of +the given package individually). Since Intel RAPL doesn't provide instantaneous +power value, there is no power_uw attribute. + +In addition to that, each power zone contains a name attribute, allowing the +part of the system represented by that zone to be identified. +For example:: + + cat /sys/class/power_cap/intel-rapl/intel-rapl:0/name + +package-0 +--------- + +The Intel RAPL technology allows two constraints, short term and long term, +with two different time windows to be applied to each power zone. Thus for +each zone there are 2 attributes representing the constraint names, 2 power +limits and 2 attributes representing the sizes of the time windows. Such that, +constraint_j_* attributes correspond to the jth constraint (j = 0,1). + +For example:: + + constraint_0_name + constraint_0_power_limit_uw + constraint_0_time_window_us + constraint_1_name + constraint_1_power_limit_uw + constraint_1_time_window_us + +Power Zone Attributes +===================== + +Monitoring attributes +--------------------- + +energy_uj (rw) + Current energy counter in micro joules. Write "0" to reset. + If the counter can not be reset, then this attribute is read only. + +max_energy_range_uj (ro) + Range of the above energy counter in micro-joules. + +power_uw (ro) + Current power in micro watts. + +max_power_range_uw (ro) + Range of the above power value in micro-watts. + +name (ro) + Name of this power zone. + +It is possible that some domains have both power ranges and energy counter ranges; +however, only one is mandatory. + +Constraints +----------- + +constraint_X_power_limit_uw (rw) + Power limit in micro watts, which should be applicable for the + time window specified by "constraint_X_time_window_us". + +constraint_X_time_window_us (rw) + Time window in micro seconds. + +constraint_X_name (ro) + An optional name of the constraint + +constraint_X_max_power_uw(ro) + Maximum allowed power in micro watts. + +constraint_X_min_power_uw(ro) + Minimum allowed power in micro watts. + +constraint_X_max_time_window_us(ro) + Maximum allowed time window in micro seconds. + +constraint_X_min_time_window_us(ro) + Minimum allowed time window in micro seconds. + +Except power_limit_uw and time_window_us other fields are optional. + +Common zone and control type attributes +--------------------------------------- + +enabled (rw): Enable/Disable controls at zone level or for all zones using +a control type. + +Power Cap Client Driver Interface +================================= + +The API summary: + +Call powercap_register_control_type() to register control type object. +Call powercap_register_zone() to register a power zone (under a given +control type), either as a top-level power zone or as a subzone of another +power zone registered earlier. +The number of constraints in a power zone and the corresponding callbacks have +to be defined prior to calling powercap_register_zone() to register that zone. + +To Free a power zone call powercap_unregister_zone(). +To free a control type object call powercap_unregister_control_type(). +Detailed API can be generated using kernel-doc on include/linux/powercap.h. diff --git a/Documentation/power/powercap/powercap.txt b/Documentation/power/powercap/powercap.txt deleted file mode 100644 index 1e6ef164e07a..000000000000 --- a/Documentation/power/powercap/powercap.txt +++ /dev/null @@ -1,236 +0,0 @@ -Power Capping Framework -================================== - -The power capping framework provides a consistent interface between the kernel -and the user space that allows power capping drivers to expose the settings to -user space in a uniform way. - -Terminology -========================= -The framework exposes power capping devices to user space via sysfs in the -form of a tree of objects. The objects at the root level of the tree represent -'control types', which correspond to different methods of power capping. For -example, the intel-rapl control type represents the Intel "Running Average -Power Limit" (RAPL) technology, whereas the 'idle-injection' control type -corresponds to the use of idle injection for controlling power. - -Power zones represent different parts of the system, which can be controlled and -monitored using the power capping method determined by the control type the -given zone belongs to. They each contain attributes for monitoring power, as -well as controls represented in the form of power constraints. If the parts of -the system represented by different power zones are hierarchical (that is, one -bigger part consists of multiple smaller parts that each have their own power -controls), those power zones may also be organized in a hierarchy with one -parent power zone containing multiple subzones and so on to reflect the power -control topology of the system. In that case, it is possible to apply power -capping to a set of devices together using the parent power zone and if more -fine grained control is required, it can be applied through the subzones. - - -Example sysfs interface tree: - -/sys/devices/virtual/powercap -??? intel-rapl - ??? intel-rapl:0 - ?   ??? constraint_0_name - ?   ??? constraint_0_power_limit_uw - ?   ??? constraint_0_time_window_us - ?   ??? constraint_1_name - ?   ??? constraint_1_power_limit_uw - ?   ??? constraint_1_time_window_us - ?   ??? device -> ../../intel-rapl - ?   ??? energy_uj - ?   ??? intel-rapl:0:0 - ?   ?   ??? constraint_0_name - ?   ?   ??? constraint_0_power_limit_uw - ?   ?   ??? constraint_0_time_window_us - ?   ?   ??? constraint_1_name - ?   ?   ??? constraint_1_power_limit_uw - ?   ?   ??? constraint_1_time_window_us - ?   ?   ??? device -> ../../intel-rapl:0 - ?   ?   ??? energy_uj - ?   ?   ??? max_energy_range_uj - ?   ?   ??? name - ?   ?   ??? enabled - ?   ?   ??? power - ?   ?   ?   ??? async - ?   ?   ?   [] - ?   ?   ??? subsystem -> ../../../../../../class/power_cap - ?   ?   ??? uevent - ?   ??? intel-rapl:0:1 - ?   ?   ??? constraint_0_name - ?   ?   ??? constraint_0_power_limit_uw - ?   ?   ??? constraint_0_time_window_us - ?   ?   ??? constraint_1_name - ?   ?   ??? constraint_1_power_limit_uw - ?   ?   ??? constraint_1_time_window_us - ?   ?   ??? device -> ../../intel-rapl:0 - ?   ?   ??? energy_uj - ?   ?   ??? max_energy_range_uj - ?   ?   ??? name - ?   ?   ??? enabled - ?   ?   ??? power - ?   ?   ?   ??? async - ?   ?   ?   [] - ?   ?   ??? subsystem -> ../../../../../../class/power_cap - ?   ?   ??? uevent - ?   ??? max_energy_range_uj - ?   ??? max_power_range_uw - ?   ??? name - ?   ??? enabled - ?   ??? power - ?   ?   ??? async - ?   ?   [] - ?   ??? subsystem -> ../../../../../class/power_cap - ?   ??? enabled - ?   ??? uevent - ??? intel-rapl:1 - ?   ??? constraint_0_name - ?   ??? constraint_0_power_limit_uw - ?   ??? constraint_0_time_window_us - ?   ??? constraint_1_name - ?   ??? constraint_1_power_limit_uw - ?   ??? constraint_1_time_window_us - ?   ??? device -> ../../intel-rapl - ?   ??? energy_uj - ?   ??? intel-rapl:1:0 - ?   ?   ??? constraint_0_name - ?   ?   ??? constraint_0_power_limit_uw - ?   ?   ??? constraint_0_time_window_us - ?   ?   ??? constraint_1_name - ?   ?   ??? constraint_1_power_limit_uw - ?   ?   ??? constraint_1_time_window_us - ?   ?   ??? device -> ../../intel-rapl:1 - ?   ?   ??? energy_uj - ?   ?   ??? max_energy_range_uj - ?   ?   ??? name - ?   ?   ??? enabled - ?   ?   ??? power - ?   ?   ?   ??? async - ?   ?   ?   [] - ?   ?   ??? subsystem -> ../../../../../../class/power_cap - ?   ?   ??? uevent - ?   ??? intel-rapl:1:1 - ?   ?   ??? constraint_0_name - ?   ?   ??? constraint_0_power_limit_uw - ?   ?   ??? constraint_0_time_window_us - ?   ?   ??? constraint_1_name - ?   ?   ??? constraint_1_power_limit_uw - ?   ?   ??? constraint_1_time_window_us - ?   ?   ??? device -> ../../intel-rapl:1 - ?   ?   ??? energy_uj - ?   ?   ??? max_energy_range_uj - ?   ?   ??? name - ?   ?   ??? enabled - ?   ?   ??? power - ?   ?   ?   ??? async - ?   ?   ?   [] - ?   ?   ??? subsystem -> ../../../../../../class/power_cap - ?   ?   ??? uevent - ?   ??? max_energy_range_uj - ?   ??? max_power_range_uw - ?   ??? name - ?   ??? enabled - ?   ??? power - ?   ?   ??? async - ?   ?   [] - ?   ??? subsystem -> ../../../../../class/power_cap - ?   ??? uevent - ??? power - ?   ??? async - ?   [] - ??? subsystem -> ../../../../class/power_cap - ??? enabled - ??? uevent - -The above example illustrates a case in which the Intel RAPL technology, -available in Intel® IA-64 and IA-32 Processor Architectures, is used. There is one -control type called intel-rapl which contains two power zones, intel-rapl:0 and -intel-rapl:1, representing CPU packages. Each of these power zones contains -two subzones, intel-rapl:j:0 and intel-rapl:j:1 (j = 0, 1), representing the -"core" and the "uncore" parts of the given CPU package, respectively. All of -the zones and subzones contain energy monitoring attributes (energy_uj, -max_energy_range_uj) and constraint attributes (constraint_*) allowing controls -to be applied (the constraints in the 'package' power zones apply to the whole -CPU packages and the subzone constraints only apply to the respective parts of -the given package individually). Since Intel RAPL doesn't provide instantaneous -power value, there is no power_uw attribute. - -In addition to that, each power zone contains a name attribute, allowing the -part of the system represented by that zone to be identified. -For example: - -cat /sys/class/power_cap/intel-rapl/intel-rapl:0/name -package-0 - -The Intel RAPL technology allows two constraints, short term and long term, -with two different time windows to be applied to each power zone. Thus for -each zone there are 2 attributes representing the constraint names, 2 power -limits and 2 attributes representing the sizes of the time windows. Such that, -constraint_j_* attributes correspond to the jth constraint (j = 0,1). - -For example: - constraint_0_name - constraint_0_power_limit_uw - constraint_0_time_window_us - constraint_1_name - constraint_1_power_limit_uw - constraint_1_time_window_us - -Power Zone Attributes -================================= -Monitoring attributes ----------------------- - -energy_uj (rw): Current energy counter in micro joules. Write "0" to reset. -If the counter can not be reset, then this attribute is read only. - -max_energy_range_uj (ro): Range of the above energy counter in micro-joules. - -power_uw (ro): Current power in micro watts. - -max_power_range_uw (ro): Range of the above power value in micro-watts. - -name (ro): Name of this power zone. - -It is possible that some domains have both power ranges and energy counter ranges; -however, only one is mandatory. - -Constraints ----------------- -constraint_X_power_limit_uw (rw): Power limit in micro watts, which should be -applicable for the time window specified by "constraint_X_time_window_us". - -constraint_X_time_window_us (rw): Time window in micro seconds. - -constraint_X_name (ro): An optional name of the constraint - -constraint_X_max_power_uw(ro): Maximum allowed power in micro watts. - -constraint_X_min_power_uw(ro): Minimum allowed power in micro watts. - -constraint_X_max_time_window_us(ro): Maximum allowed time window in micro seconds. - -constraint_X_min_time_window_us(ro): Minimum allowed time window in micro seconds. - -Except power_limit_uw and time_window_us other fields are optional. - -Common zone and control type attributes ----------------------------------------- -enabled (rw): Enable/Disable controls at zone level or for all zones using -a control type. - -Power Cap Client Driver Interface -================================== -The API summary: - -Call powercap_register_control_type() to register control type object. -Call powercap_register_zone() to register a power zone (under a given -control type), either as a top-level power zone or as a subzone of another -power zone registered earlier. -The number of constraints in a power zone and the corresponding callbacks have -to be defined prior to calling powercap_register_zone() to register that zone. - -To Free a power zone call powercap_unregister_zone(). -To free a control type object call powercap_unregister_control_type(). -Detailed API can be generated using kernel-doc on include/linux/powercap.h. diff --git a/Documentation/power/regulator/consumer.rst b/Documentation/power/regulator/consumer.rst new file mode 100644 index 000000000000..0cd8cc1275a7 --- /dev/null +++ b/Documentation/power/regulator/consumer.rst @@ -0,0 +1,229 @@ +=================================== +Regulator Consumer Driver Interface +=================================== + +This text describes the regulator interface for consumer device drivers. +Please see overview.txt for a description of the terms used in this text. + + +1. Consumer Regulator Access (static & dynamic drivers) +======================================================= + +A consumer driver can get access to its supply regulator by calling :: + + regulator = regulator_get(dev, "Vcc"); + +The consumer passes in its struct device pointer and power supply ID. The core +then finds the correct regulator by consulting a machine specific lookup table. +If the lookup is successful then this call will return a pointer to the struct +regulator that supplies this consumer. + +To release the regulator the consumer driver should call :: + + regulator_put(regulator); + +Consumers can be supplied by more than one regulator e.g. codec consumer with +analog and digital supplies :: + + digital = regulator_get(dev, "Vcc"); /* digital core */ + analog = regulator_get(dev, "Avdd"); /* analog */ + +The regulator access functions regulator_get() and regulator_put() will +usually be called in your device drivers probe() and remove() respectively. + + +2. Regulator Output Enable & Disable (static & dynamic drivers) +=============================================================== + + +A consumer can enable its power supply by calling:: + + int regulator_enable(regulator); + +NOTE: + The supply may already be enabled before regulator_enabled() is called. + This may happen if the consumer shares the regulator or the regulator has been + previously enabled by bootloader or kernel board initialization code. + +A consumer can determine if a regulator is enabled by calling:: + + int regulator_is_enabled(regulator); + +This will return > zero when the regulator is enabled. + + +A consumer can disable its supply when no longer needed by calling:: + + int regulator_disable(regulator); + +NOTE: + This may not disable the supply if it's shared with other consumers. The + regulator will only be disabled when the enabled reference count is zero. + +Finally, a regulator can be forcefully disabled in the case of an emergency:: + + int regulator_force_disable(regulator); + +NOTE: + this will immediately and forcefully shutdown the regulator output. All + consumers will be powered off. + + +3. Regulator Voltage Control & Status (dynamic drivers) +======================================================= + +Some consumer drivers need to be able to dynamically change their supply +voltage to match system operating points. e.g. CPUfreq drivers can scale +voltage along with frequency to save power, SD drivers may need to select the +correct card voltage, etc. + +Consumers can control their supply voltage by calling:: + + int regulator_set_voltage(regulator, min_uV, max_uV); + +Where min_uV and max_uV are the minimum and maximum acceptable voltages in +microvolts. + +NOTE: this can be called when the regulator is enabled or disabled. If called +when enabled, then the voltage changes instantly, otherwise the voltage +configuration changes and the voltage is physically set when the regulator is +next enabled. + +The regulators configured voltage output can be found by calling:: + + int regulator_get_voltage(regulator); + +NOTE: + get_voltage() will return the configured output voltage whether the + regulator is enabled or disabled and should NOT be used to determine regulator + output state. However this can be used in conjunction with is_enabled() to + determine the regulator physical output voltage. + + +4. Regulator Current Limit Control & Status (dynamic drivers) +============================================================= + +Some consumer drivers need to be able to dynamically change their supply +current limit to match system operating points. e.g. LCD backlight driver can +change the current limit to vary the backlight brightness, USB drivers may want +to set the limit to 500mA when supplying power. + +Consumers can control their supply current limit by calling:: + + int regulator_set_current_limit(regulator, min_uA, max_uA); + +Where min_uA and max_uA are the minimum and maximum acceptable current limit in +microamps. + +NOTE: + this can be called when the regulator is enabled or disabled. If called + when enabled, then the current limit changes instantly, otherwise the current + limit configuration changes and the current limit is physically set when the + regulator is next enabled. + +A regulators current limit can be found by calling:: + + int regulator_get_current_limit(regulator); + +NOTE: + get_current_limit() will return the current limit whether the regulator + is enabled or disabled and should not be used to determine regulator current + load. + + +5. Regulator Operating Mode Control & Status (dynamic drivers) +============================================================== + +Some consumers can further save system power by changing the operating mode of +their supply regulator to be more efficient when the consumers operating state +changes. e.g. consumer driver is idle and subsequently draws less current + +Regulator operating mode can be changed indirectly or directly. + +Indirect operating mode control. +-------------------------------- +Consumer drivers can request a change in their supply regulator operating mode +by calling:: + + int regulator_set_load(struct regulator *regulator, int load_uA); + +This will cause the core to recalculate the total load on the regulator (based +on all its consumers) and change operating mode (if necessary and permitted) +to best match the current operating load. + +The load_uA value can be determined from the consumer's datasheet. e.g. most +datasheets have tables showing the maximum current consumed in certain +situations. + +Most consumers will use indirect operating mode control since they have no +knowledge of the regulator or whether the regulator is shared with other +consumers. + +Direct operating mode control. +------------------------------ + +Bespoke or tightly coupled drivers may want to directly control regulator +operating mode depending on their operating point. This can be achieved by +calling:: + + int regulator_set_mode(struct regulator *regulator, unsigned int mode); + unsigned int regulator_get_mode(struct regulator *regulator); + +Direct mode will only be used by consumers that *know* about the regulator and +are not sharing the regulator with other consumers. + + +6. Regulator Events +=================== + +Regulators can notify consumers of external events. Events could be received by +consumers under regulator stress or failure conditions. + +Consumers can register interest in regulator events by calling:: + + int regulator_register_notifier(struct regulator *regulator, + struct notifier_block *nb); + +Consumers can unregister interest by calling:: + + int regulator_unregister_notifier(struct regulator *regulator, + struct notifier_block *nb); + +Regulators use the kernel notifier framework to send event to their interested +consumers. + +7. Regulator Direct Register Access +=================================== + +Some kinds of power management hardware or firmware are designed such that +they need to do low-level hardware access to regulators, with no involvement +from the kernel. Examples of such devices are: + +- clocksource with a voltage-controlled oscillator and control logic to change + the supply voltage over I2C to achieve a desired output clock rate +- thermal management firmware that can issue an arbitrary I2C transaction to + perform system poweroff during overtemperature conditions + +To set up such a device/firmware, various parameters like I2C address of the +regulator, addresses of various regulator registers etc. need to be configured +to it. The regulator framework provides the following helpers for querying +these details. + +Bus-specific details, like I2C addresses or transfer rates are handled by the +regmap framework. To get the regulator's regmap (if supported), use:: + + struct regmap *regulator_get_regmap(struct regulator *regulator); + +To obtain the hardware register offset and bitmask for the regulator's voltage +selector register, use:: + + int regulator_get_hardware_vsel_register(struct regulator *regulator, + unsigned *vsel_reg, + unsigned *vsel_mask); + +To convert a regulator framework voltage selector code (used by +regulator_list_voltage) to a hardware-specific voltage selector that can be +directly written to the voltage selector register, use:: + + int regulator_list_hardware_vsel(struct regulator *regulator, + unsigned selector); diff --git a/Documentation/power/regulator/consumer.txt b/Documentation/power/regulator/consumer.txt deleted file mode 100644 index e51564c1a140..000000000000 --- a/Documentation/power/regulator/consumer.txt +++ /dev/null @@ -1,218 +0,0 @@ -Regulator Consumer Driver Interface -=================================== - -This text describes the regulator interface for consumer device drivers. -Please see overview.txt for a description of the terms used in this text. - - -1. Consumer Regulator Access (static & dynamic drivers) -======================================================= - -A consumer driver can get access to its supply regulator by calling :- - -regulator = regulator_get(dev, "Vcc"); - -The consumer passes in its struct device pointer and power supply ID. The core -then finds the correct regulator by consulting a machine specific lookup table. -If the lookup is successful then this call will return a pointer to the struct -regulator that supplies this consumer. - -To release the regulator the consumer driver should call :- - -regulator_put(regulator); - -Consumers can be supplied by more than one regulator e.g. codec consumer with -analog and digital supplies :- - -digital = regulator_get(dev, "Vcc"); /* digital core */ -analog = regulator_get(dev, "Avdd"); /* analog */ - -The regulator access functions regulator_get() and regulator_put() will -usually be called in your device drivers probe() and remove() respectively. - - -2. Regulator Output Enable & Disable (static & dynamic drivers) -==================================================================== - -A consumer can enable its power supply by calling:- - -int regulator_enable(regulator); - -NOTE: The supply may already be enabled before regulator_enabled() is called. -This may happen if the consumer shares the regulator or the regulator has been -previously enabled by bootloader or kernel board initialization code. - -A consumer can determine if a regulator is enabled by calling :- - -int regulator_is_enabled(regulator); - -This will return > zero when the regulator is enabled. - - -A consumer can disable its supply when no longer needed by calling :- - -int regulator_disable(regulator); - -NOTE: This may not disable the supply if it's shared with other consumers. The -regulator will only be disabled when the enabled reference count is zero. - -Finally, a regulator can be forcefully disabled in the case of an emergency :- - -int regulator_force_disable(regulator); - -NOTE: this will immediately and forcefully shutdown the regulator output. All -consumers will be powered off. - - -3. Regulator Voltage Control & Status (dynamic drivers) -====================================================== - -Some consumer drivers need to be able to dynamically change their supply -voltage to match system operating points. e.g. CPUfreq drivers can scale -voltage along with frequency to save power, SD drivers may need to select the -correct card voltage, etc. - -Consumers can control their supply voltage by calling :- - -int regulator_set_voltage(regulator, min_uV, max_uV); - -Where min_uV and max_uV are the minimum and maximum acceptable voltages in -microvolts. - -NOTE: this can be called when the regulator is enabled or disabled. If called -when enabled, then the voltage changes instantly, otherwise the voltage -configuration changes and the voltage is physically set when the regulator is -next enabled. - -The regulators configured voltage output can be found by calling :- - -int regulator_get_voltage(regulator); - -NOTE: get_voltage() will return the configured output voltage whether the -regulator is enabled or disabled and should NOT be used to determine regulator -output state. However this can be used in conjunction with is_enabled() to -determine the regulator physical output voltage. - - -4. Regulator Current Limit Control & Status (dynamic drivers) -=========================================================== - -Some consumer drivers need to be able to dynamically change their supply -current limit to match system operating points. e.g. LCD backlight driver can -change the current limit to vary the backlight brightness, USB drivers may want -to set the limit to 500mA when supplying power. - -Consumers can control their supply current limit by calling :- - -int regulator_set_current_limit(regulator, min_uA, max_uA); - -Where min_uA and max_uA are the minimum and maximum acceptable current limit in -microamps. - -NOTE: this can be called when the regulator is enabled or disabled. If called -when enabled, then the current limit changes instantly, otherwise the current -limit configuration changes and the current limit is physically set when the -regulator is next enabled. - -A regulators current limit can be found by calling :- - -int regulator_get_current_limit(regulator); - -NOTE: get_current_limit() will return the current limit whether the regulator -is enabled or disabled and should not be used to determine regulator current -load. - - -5. Regulator Operating Mode Control & Status (dynamic drivers) -============================================================= - -Some consumers can further save system power by changing the operating mode of -their supply regulator to be more efficient when the consumers operating state -changes. e.g. consumer driver is idle and subsequently draws less current - -Regulator operating mode can be changed indirectly or directly. - -Indirect operating mode control. --------------------------------- -Consumer drivers can request a change in their supply regulator operating mode -by calling :- - -int regulator_set_load(struct regulator *regulator, int load_uA); - -This will cause the core to recalculate the total load on the regulator (based -on all its consumers) and change operating mode (if necessary and permitted) -to best match the current operating load. - -The load_uA value can be determined from the consumer's datasheet. e.g. most -datasheets have tables showing the maximum current consumed in certain -situations. - -Most consumers will use indirect operating mode control since they have no -knowledge of the regulator or whether the regulator is shared with other -consumers. - -Direct operating mode control. ------------------------------- -Bespoke or tightly coupled drivers may want to directly control regulator -operating mode depending on their operating point. This can be achieved by -calling :- - -int regulator_set_mode(struct regulator *regulator, unsigned int mode); -unsigned int regulator_get_mode(struct regulator *regulator); - -Direct mode will only be used by consumers that *know* about the regulator and -are not sharing the regulator with other consumers. - - -6. Regulator Events -=================== -Regulators can notify consumers of external events. Events could be received by -consumers under regulator stress or failure conditions. - -Consumers can register interest in regulator events by calling :- - -int regulator_register_notifier(struct regulator *regulator, - struct notifier_block *nb); - -Consumers can unregister interest by calling :- - -int regulator_unregister_notifier(struct regulator *regulator, - struct notifier_block *nb); - -Regulators use the kernel notifier framework to send event to their interested -consumers. - -7. Regulator Direct Register Access -=================================== -Some kinds of power management hardware or firmware are designed such that -they need to do low-level hardware access to regulators, with no involvement -from the kernel. Examples of such devices are: - -- clocksource with a voltage-controlled oscillator and control logic to change - the supply voltage over I2C to achieve a desired output clock rate -- thermal management firmware that can issue an arbitrary I2C transaction to - perform system poweroff during overtemperature conditions - -To set up such a device/firmware, various parameters like I2C address of the -regulator, addresses of various regulator registers etc. need to be configured -to it. The regulator framework provides the following helpers for querying -these details. - -Bus-specific details, like I2C addresses or transfer rates are handled by the -regmap framework. To get the regulator's regmap (if supported), use :- - -struct regmap *regulator_get_regmap(struct regulator *regulator); - -To obtain the hardware register offset and bitmask for the regulator's voltage -selector register, use :- - -int regulator_get_hardware_vsel_register(struct regulator *regulator, - unsigned *vsel_reg, - unsigned *vsel_mask); - -To convert a regulator framework voltage selector code (used by -regulator_list_voltage) to a hardware-specific voltage selector that can be -directly written to the voltage selector register, use :- - -int regulator_list_hardware_vsel(struct regulator *regulator, - unsigned selector); diff --git a/Documentation/power/regulator/design.rst b/Documentation/power/regulator/design.rst new file mode 100644 index 000000000000..3b09c6841dc4 --- /dev/null +++ b/Documentation/power/regulator/design.rst @@ -0,0 +1,38 @@ +========================== +Regulator API design notes +========================== + +This document provides a brief, partially structured, overview of some +of the design considerations which impact the regulator API design. + +Safety +------ + + - Errors in regulator configuration can have very serious consequences + for the system, potentially including lasting hardware damage. + - It is not possible to automatically determine the power configuration + of the system - software-equivalent variants of the same chip may + have different power requirements, and not all components with power + requirements are visible to software. + +.. note:: + + The API should make no changes to the hardware state unless it has + specific knowledge that these changes are safe to perform on this + particular system. + +Consumer use cases +------------------ + + - The overwhelming majority of devices in a system will have no + requirement to do any runtime configuration of their power beyond + being able to turn it on or off. + + - Many of the power supplies in the system will be shared between many + different consumers. + +.. note:: + + The consumer API should be structured so that these use cases are + very easy to handle and so that consumers will work with shared + supplies without any additional effort. diff --git a/Documentation/power/regulator/design.txt b/Documentation/power/regulator/design.txt deleted file mode 100644 index fdd919b96830..000000000000 --- a/Documentation/power/regulator/design.txt +++ /dev/null @@ -1,33 +0,0 @@ -Regulator API design notes -========================== - -This document provides a brief, partially structured, overview of some -of the design considerations which impact the regulator API design. - -Safety ------- - - - Errors in regulator configuration can have very serious consequences - for the system, potentially including lasting hardware damage. - - It is not possible to automatically determine the power configuration - of the system - software-equivalent variants of the same chip may - have different power requirements, and not all components with power - requirements are visible to software. - - => The API should make no changes to the hardware state unless it has - specific knowledge that these changes are safe to perform on this - particular system. - -Consumer use cases ------------------- - - - The overwhelming majority of devices in a system will have no - requirement to do any runtime configuration of their power beyond - being able to turn it on or off. - - - Many of the power supplies in the system will be shared between many - different consumers. - - => The consumer API should be structured so that these use cases are - very easy to handle and so that consumers will work with shared - supplies without any additional effort. diff --git a/Documentation/power/regulator/machine.rst b/Documentation/power/regulator/machine.rst new file mode 100644 index 000000000000..22fffefaa3ad --- /dev/null +++ b/Documentation/power/regulator/machine.rst @@ -0,0 +1,97 @@ +================================== +Regulator Machine Driver Interface +================================== + +The regulator machine driver interface is intended for board/machine specific +initialisation code to configure the regulator subsystem. + +Consider the following machine:: + + Regulator-1 -+-> Regulator-2 --> [Consumer A @ 1.8 - 2.0V] + | + +-> [Consumer B @ 3.3V] + +The drivers for consumers A & B must be mapped to the correct regulator in +order to control their power supplies. This mapping can be achieved in machine +initialisation code by creating a struct regulator_consumer_supply for +each regulator:: + + struct regulator_consumer_supply { + const char *dev_name; /* consumer dev_name() */ + const char *supply; /* consumer supply - e.g. "vcc" */ + }; + +e.g. for the machine above:: + + static struct regulator_consumer_supply regulator1_consumers[] = { + REGULATOR_SUPPLY("Vcc", "consumer B"), + }; + + static struct regulator_consumer_supply regulator2_consumers[] = { + REGULATOR_SUPPLY("Vcc", "consumer A"), + }; + +This maps Regulator-1 to the 'Vcc' supply for Consumer B and maps Regulator-2 +to the 'Vcc' supply for Consumer A. + +Constraints can now be registered by defining a struct regulator_init_data +for each regulator power domain. This structure also maps the consumers +to their supply regulators:: + + static struct regulator_init_data regulator1_data = { + .constraints = { + .name = "Regulator-1", + .min_uV = 3300000, + .max_uV = 3300000, + .valid_modes_mask = REGULATOR_MODE_NORMAL, + }, + .num_consumer_supplies = ARRAY_SIZE(regulator1_consumers), + .consumer_supplies = regulator1_consumers, + }; + +The name field should be set to something that is usefully descriptive +for the board for configuration of supplies for other regulators and +for use in logging and other diagnostic output. Normally the name +used for the supply rail in the schematic is a good choice. If no +name is provided then the subsystem will choose one. + +Regulator-1 supplies power to Regulator-2. This relationship must be registered +with the core so that Regulator-1 is also enabled when Consumer A enables its +supply (Regulator-2). The supply regulator is set by the supply_regulator +field below and co:: + + static struct regulator_init_data regulator2_data = { + .supply_regulator = "Regulator-1", + .constraints = { + .min_uV = 1800000, + .max_uV = 2000000, + .valid_ops_mask = REGULATOR_CHANGE_VOLTAGE, + .valid_modes_mask = REGULATOR_MODE_NORMAL, + }, + .num_consumer_supplies = ARRAY_SIZE(regulator2_consumers), + .consumer_supplies = regulator2_consumers, + }; + +Finally the regulator devices must be registered in the usual manner:: + + static struct platform_device regulator_devices[] = { + { + .name = "regulator", + .id = DCDC_1, + .dev = { + .platform_data = ®ulator1_data, + }, + }, + { + .name = "regulator", + .id = DCDC_2, + .dev = { + .platform_data = ®ulator2_data, + }, + }, + }; + /* register regulator 1 device */ + platform_device_register(®ulator_devices[0]); + + /* register regulator 2 device */ + platform_device_register(®ulator_devices[1]); diff --git a/Documentation/power/regulator/machine.txt b/Documentation/power/regulator/machine.txt deleted file mode 100644 index eff4dcaaa252..000000000000 --- a/Documentation/power/regulator/machine.txt +++ /dev/null @@ -1,96 +0,0 @@ -Regulator Machine Driver Interface -=================================== - -The regulator machine driver interface is intended for board/machine specific -initialisation code to configure the regulator subsystem. - -Consider the following machine :- - - Regulator-1 -+-> Regulator-2 --> [Consumer A @ 1.8 - 2.0V] - | - +-> [Consumer B @ 3.3V] - -The drivers for consumers A & B must be mapped to the correct regulator in -order to control their power supplies. This mapping can be achieved in machine -initialisation code by creating a struct regulator_consumer_supply for -each regulator. - -struct regulator_consumer_supply { - const char *dev_name; /* consumer dev_name() */ - const char *supply; /* consumer supply - e.g. "vcc" */ -}; - -e.g. for the machine above - -static struct regulator_consumer_supply regulator1_consumers[] = { - REGULATOR_SUPPLY("Vcc", "consumer B"), -}; - -static struct regulator_consumer_supply regulator2_consumers[] = { - REGULATOR_SUPPLY("Vcc", "consumer A"), -}; - -This maps Regulator-1 to the 'Vcc' supply for Consumer B and maps Regulator-2 -to the 'Vcc' supply for Consumer A. - -Constraints can now be registered by defining a struct regulator_init_data -for each regulator power domain. This structure also maps the consumers -to their supply regulators :- - -static struct regulator_init_data regulator1_data = { - .constraints = { - .name = "Regulator-1", - .min_uV = 3300000, - .max_uV = 3300000, - .valid_modes_mask = REGULATOR_MODE_NORMAL, - }, - .num_consumer_supplies = ARRAY_SIZE(regulator1_consumers), - .consumer_supplies = regulator1_consumers, -}; - -The name field should be set to something that is usefully descriptive -for the board for configuration of supplies for other regulators and -for use in logging and other diagnostic output. Normally the name -used for the supply rail in the schematic is a good choice. If no -name is provided then the subsystem will choose one. - -Regulator-1 supplies power to Regulator-2. This relationship must be registered -with the core so that Regulator-1 is also enabled when Consumer A enables its -supply (Regulator-2). The supply regulator is set by the supply_regulator -field below and co:- - -static struct regulator_init_data regulator2_data = { - .supply_regulator = "Regulator-1", - .constraints = { - .min_uV = 1800000, - .max_uV = 2000000, - .valid_ops_mask = REGULATOR_CHANGE_VOLTAGE, - .valid_modes_mask = REGULATOR_MODE_NORMAL, - }, - .num_consumer_supplies = ARRAY_SIZE(regulator2_consumers), - .consumer_supplies = regulator2_consumers, -}; - -Finally the regulator devices must be registered in the usual manner. - -static struct platform_device regulator_devices[] = { - { - .name = "regulator", - .id = DCDC_1, - .dev = { - .platform_data = ®ulator1_data, - }, - }, - { - .name = "regulator", - .id = DCDC_2, - .dev = { - .platform_data = ®ulator2_data, - }, - }, -}; -/* register regulator 1 device */ -platform_device_register(®ulator_devices[0]); - -/* register regulator 2 device */ -platform_device_register(®ulator_devices[1]); diff --git a/Documentation/power/regulator/overview.rst b/Documentation/power/regulator/overview.rst new file mode 100644 index 000000000000..ee494c70a7c4 --- /dev/null +++ b/Documentation/power/regulator/overview.rst @@ -0,0 +1,178 @@ +============================================= +Linux voltage and current regulator framework +============================================= + +About +===== + +This framework is designed to provide a standard kernel interface to control +voltage and current regulators. + +The intention is to allow systems to dynamically control regulator power output +in order to save power and prolong battery life. This applies to both voltage +regulators (where voltage output is controllable) and current sinks (where +current limit is controllable). + +(C) 2008 Wolfson Microelectronics PLC. + +Author: Liam Girdwood + + +Nomenclature +============ + +Some terms used in this document: + + - Regulator + - Electronic device that supplies power to other devices. + Most regulators can enable and disable their output while + some can control their output voltage and or current. + + Input Voltage -> Regulator -> Output Voltage + + + - PMIC + - Power Management IC. An IC that contains numerous + regulators and often contains other subsystems. + + + - Consumer + - Electronic device that is supplied power by a regulator. + Consumers can be classified into two types:- + + Static: consumer does not change its supply voltage or + current limit. It only needs to enable or disable its + power supply. Its supply voltage is set by the hardware, + bootloader, firmware or kernel board initialisation code. + + Dynamic: consumer needs to change its supply voltage or + current limit to meet operation demands. + + + - Power Domain + - Electronic circuit that is supplied its input power by the + output power of a regulator, switch or by another power + domain. + + The supply regulator may be behind a switch(s). i.e.:: + + Regulator -+-> Switch-1 -+-> Switch-2 --> [Consumer A] + | | + | +-> [Consumer B], [Consumer C] + | + +-> [Consumer D], [Consumer E] + + That is one regulator and three power domains: + + - Domain 1: Switch-1, Consumers D & E. + - Domain 2: Switch-2, Consumers B & C. + - Domain 3: Consumer A. + + and this represents a "supplies" relationship: + + Domain-1 --> Domain-2 --> Domain-3. + + A power domain may have regulators that are supplied power + by other regulators. i.e.:: + + Regulator-1 -+-> Regulator-2 -+-> [Consumer A] + | + +-> [Consumer B] + + This gives us two regulators and two power domains: + + - Domain 1: Regulator-2, Consumer B. + - Domain 2: Consumer A. + + and a "supplies" relationship: + + Domain-1 --> Domain-2 + + + - Constraints + - Constraints are used to define power levels for performance + and hardware protection. Constraints exist at three levels: + + Regulator Level: This is defined by the regulator hardware + operating parameters and is specified in the regulator + datasheet. i.e. + + - voltage output is in the range 800mV -> 3500mV. + - regulator current output limit is 20mA @ 5V but is + 10mA @ 10V. + + Power Domain Level: This is defined in software by kernel + level board initialisation code. It is used to constrain a + power domain to a particular power range. i.e. + + - Domain-1 voltage is 3300mV + - Domain-2 voltage is 1400mV -> 1600mV + - Domain-3 current limit is 0mA -> 20mA. + + Consumer Level: This is defined by consumer drivers + dynamically setting voltage or current limit levels. + + e.g. a consumer backlight driver asks for a current increase + from 5mA to 10mA to increase LCD illumination. This passes + to through the levels as follows :- + + Consumer: need to increase LCD brightness. Lookup and + request next current mA value in brightness table (the + consumer driver could be used on several different + personalities based upon the same reference device). + + Power Domain: is the new current limit within the domain + operating limits for this domain and system state (e.g. + battery power, USB power) + + Regulator Domains: is the new current limit within the + regulator operating parameters for input/output voltage. + + If the regulator request passes all the constraint tests + then the new regulator value is applied. + + +Design +====== + +The framework is designed and targeted at SoC based devices but may also be +relevant to non SoC devices and is split into the following four interfaces:- + + + 1. Consumer driver interface. + + This uses a similar API to the kernel clock interface in that consumer + drivers can get and put a regulator (like they can with clocks atm) and + get/set voltage, current limit, mode, enable and disable. This should + allow consumers complete control over their supply voltage and current + limit. This also compiles out if not in use so drivers can be reused in + systems with no regulator based power control. + + See Documentation/power/regulator/consumer.rst + + 2. Regulator driver interface. + + This allows regulator drivers to register their regulators and provide + operations to the core. It also has a notifier call chain for propagating + regulator events to clients. + + See Documentation/power/regulator/regulator.rst + + 3. Machine interface. + + This interface is for machine specific code and allows the creation of + voltage/current domains (with constraints) for each regulator. It can + provide regulator constraints that will prevent device damage through + overvoltage or overcurrent caused by buggy client drivers. It also + allows the creation of a regulator tree whereby some regulators are + supplied by others (similar to a clock tree). + + See Documentation/power/regulator/machine.rst + + 4. Userspace ABI. + + The framework also exports a lot of useful voltage/current/opmode data to + userspace via sysfs. This could be used to help monitor device power + consumption and status. + + See Documentation/ABI/testing/sysfs-class-regulator diff --git a/Documentation/power/regulator/overview.txt b/Documentation/power/regulator/overview.txt deleted file mode 100644 index 721b4739ec32..000000000000 --- a/Documentation/power/regulator/overview.txt +++ /dev/null @@ -1,171 +0,0 @@ -Linux voltage and current regulator framework -============================================= - -About -===== - -This framework is designed to provide a standard kernel interface to control -voltage and current regulators. - -The intention is to allow systems to dynamically control regulator power output -in order to save power and prolong battery life. This applies to both voltage -regulators (where voltage output is controllable) and current sinks (where -current limit is controllable). - -(C) 2008 Wolfson Microelectronics PLC. -Author: Liam Girdwood - - -Nomenclature -============ - -Some terms used in this document:- - - o Regulator - Electronic device that supplies power to other devices. - Most regulators can enable and disable their output while - some can control their output voltage and or current. - - Input Voltage -> Regulator -> Output Voltage - - - o PMIC - Power Management IC. An IC that contains numerous regulators - and often contains other subsystems. - - - o Consumer - Electronic device that is supplied power by a regulator. - Consumers can be classified into two types:- - - Static: consumer does not change its supply voltage or - current limit. It only needs to enable or disable its - power supply. Its supply voltage is set by the hardware, - bootloader, firmware or kernel board initialisation code. - - Dynamic: consumer needs to change its supply voltage or - current limit to meet operation demands. - - - o Power Domain - Electronic circuit that is supplied its input power by the - output power of a regulator, switch or by another power - domain. - - The supply regulator may be behind a switch(s). i.e. - - Regulator -+-> Switch-1 -+-> Switch-2 --> [Consumer A] - | | - | +-> [Consumer B], [Consumer C] - | - +-> [Consumer D], [Consumer E] - - That is one regulator and three power domains: - - Domain 1: Switch-1, Consumers D & E. - Domain 2: Switch-2, Consumers B & C. - Domain 3: Consumer A. - - and this represents a "supplies" relationship: - - Domain-1 --> Domain-2 --> Domain-3. - - A power domain may have regulators that are supplied power - by other regulators. i.e. - - Regulator-1 -+-> Regulator-2 -+-> [Consumer A] - | - +-> [Consumer B] - - This gives us two regulators and two power domains: - - Domain 1: Regulator-2, Consumer B. - Domain 2: Consumer A. - - and a "supplies" relationship: - - Domain-1 --> Domain-2 - - - o Constraints - Constraints are used to define power levels for performance - and hardware protection. Constraints exist at three levels: - - Regulator Level: This is defined by the regulator hardware - operating parameters and is specified in the regulator - datasheet. i.e. - - - voltage output is in the range 800mV -> 3500mV. - - regulator current output limit is 20mA @ 5V but is - 10mA @ 10V. - - Power Domain Level: This is defined in software by kernel - level board initialisation code. It is used to constrain a - power domain to a particular power range. i.e. - - - Domain-1 voltage is 3300mV - - Domain-2 voltage is 1400mV -> 1600mV - - Domain-3 current limit is 0mA -> 20mA. - - Consumer Level: This is defined by consumer drivers - dynamically setting voltage or current limit levels. - - e.g. a consumer backlight driver asks for a current increase - from 5mA to 10mA to increase LCD illumination. This passes - to through the levels as follows :- - - Consumer: need to increase LCD brightness. Lookup and - request next current mA value in brightness table (the - consumer driver could be used on several different - personalities based upon the same reference device). - - Power Domain: is the new current limit within the domain - operating limits for this domain and system state (e.g. - battery power, USB power) - - Regulator Domains: is the new current limit within the - regulator operating parameters for input/output voltage. - - If the regulator request passes all the constraint tests - then the new regulator value is applied. - - -Design -====== - -The framework is designed and targeted at SoC based devices but may also be -relevant to non SoC devices and is split into the following four interfaces:- - - - 1. Consumer driver interface. - - This uses a similar API to the kernel clock interface in that consumer - drivers can get and put a regulator (like they can with clocks atm) and - get/set voltage, current limit, mode, enable and disable. This should - allow consumers complete control over their supply voltage and current - limit. This also compiles out if not in use so drivers can be reused in - systems with no regulator based power control. - - See Documentation/power/regulator/consumer.txt - - 2. Regulator driver interface. - - This allows regulator drivers to register their regulators and provide - operations to the core. It also has a notifier call chain for propagating - regulator events to clients. - - See Documentation/power/regulator/regulator.txt - - 3. Machine interface. - - This interface is for machine specific code and allows the creation of - voltage/current domains (with constraints) for each regulator. It can - provide regulator constraints that will prevent device damage through - overvoltage or overcurrent caused by buggy client drivers. It also - allows the creation of a regulator tree whereby some regulators are - supplied by others (similar to a clock tree). - - See Documentation/power/regulator/machine.txt - - 4. Userspace ABI. - - The framework also exports a lot of useful voltage/current/opmode data to - userspace via sysfs. This could be used to help monitor device power - consumption and status. - - See Documentation/ABI/testing/sysfs-class-regulator diff --git a/Documentation/power/regulator/regulator.rst b/Documentation/power/regulator/regulator.rst new file mode 100644 index 000000000000..794b3256fbb9 --- /dev/null +++ b/Documentation/power/regulator/regulator.rst @@ -0,0 +1,32 @@ +========================== +Regulator Driver Interface +========================== + +The regulator driver interface is relatively simple and designed to allow +regulator drivers to register their services with the core framework. + + +Registration +============ + +Drivers can register a regulator by calling:: + + struct regulator_dev *regulator_register(struct regulator_desc *regulator_desc, + const struct regulator_config *config); + +This will register the regulator's capabilities and operations to the regulator +core. + +Regulators can be unregistered by calling:: + + void regulator_unregister(struct regulator_dev *rdev); + + +Regulator Events +================ + +Regulators can send events (e.g. overtemperature, undervoltage, etc) to +consumer drivers by calling:: + + int regulator_notifier_call_chain(struct regulator_dev *rdev, + unsigned long event, void *data); diff --git a/Documentation/power/regulator/regulator.txt b/Documentation/power/regulator/regulator.txt deleted file mode 100644 index b17e5833ce21..000000000000 --- a/Documentation/power/regulator/regulator.txt +++ /dev/null @@ -1,30 +0,0 @@ -Regulator Driver Interface -========================== - -The regulator driver interface is relatively simple and designed to allow -regulator drivers to register their services with the core framework. - - -Registration -============ - -Drivers can register a regulator by calling :- - -struct regulator_dev *regulator_register(struct regulator_desc *regulator_desc, - const struct regulator_config *config); - -This will register the regulator's capabilities and operations to the regulator -core. - -Regulators can be unregistered by calling :- - -void regulator_unregister(struct regulator_dev *rdev); - - -Regulator Events -================ -Regulators can send events (e.g. overtemperature, undervoltage, etc) to -consumer drivers by calling :- - -int regulator_notifier_call_chain(struct regulator_dev *rdev, - unsigned long event, void *data); diff --git a/Documentation/power/runtime_pm.rst b/Documentation/power/runtime_pm.rst new file mode 100644 index 000000000000..2c2ec99b5088 --- /dev/null +++ b/Documentation/power/runtime_pm.rst @@ -0,0 +1,940 @@ +================================================== +Runtime Power Management Framework for I/O Devices +================================================== + +(C) 2009-2011 Rafael J. Wysocki , Novell Inc. + +(C) 2010 Alan Stern + +(C) 2014 Intel Corp., Rafael J. Wysocki + +1. Introduction +=============== + +Support for runtime power management (runtime PM) of I/O devices is provided +at the power management core (PM core) level by means of: + +* The power management workqueue pm_wq in which bus types and device drivers can + put their PM-related work items. It is strongly recommended that pm_wq be + used for queuing all work items related to runtime PM, because this allows + them to be synchronized with system-wide power transitions (suspend to RAM, + hibernation and resume from system sleep states). pm_wq is declared in + include/linux/pm_runtime.h and defined in kernel/power/main.c. + +* A number of runtime PM fields in the 'power' member of 'struct device' (which + is of the type 'struct dev_pm_info', defined in include/linux/pm.h) that can + be used for synchronizing runtime PM operations with one another. + +* Three device runtime PM callbacks in 'struct dev_pm_ops' (defined in + include/linux/pm.h). + +* A set of helper functions defined in drivers/base/power/runtime.c that can be + used for carrying out runtime PM operations in such a way that the + synchronization between them is taken care of by the PM core. Bus types and + device drivers are encouraged to use these functions. + +The runtime PM callbacks present in 'struct dev_pm_ops', the device runtime PM +fields of 'struct dev_pm_info' and the core helper functions provided for +runtime PM are described below. + +2. Device Runtime PM Callbacks +============================== + +There are three device runtime PM callbacks defined in 'struct dev_pm_ops':: + + struct dev_pm_ops { + ... + int (*runtime_suspend)(struct device *dev); + int (*runtime_resume)(struct device *dev); + int (*runtime_idle)(struct device *dev); + ... + }; + +The ->runtime_suspend(), ->runtime_resume() and ->runtime_idle() callbacks +are executed by the PM core for the device's subsystem that may be either of +the following: + + 1. PM domain of the device, if the device's PM domain object, dev->pm_domain, + is present. + + 2. Device type of the device, if both dev->type and dev->type->pm are present. + + 3. Device class of the device, if both dev->class and dev->class->pm are + present. + + 4. Bus type of the device, if both dev->bus and dev->bus->pm are present. + +If the subsystem chosen by applying the above rules doesn't provide the relevant +callback, the PM core will invoke the corresponding driver callback stored in +dev->driver->pm directly (if present). + +The PM core always checks which callback to use in the order given above, so the +priority order of callbacks from high to low is: PM domain, device type, class +and bus type. Moreover, the high-priority one will always take precedence over +a low-priority one. The PM domain, bus type, device type and class callbacks +are referred to as subsystem-level callbacks in what follows. + +By default, the callbacks are always invoked in process context with interrupts +enabled. However, the pm_runtime_irq_safe() helper function can be used to tell +the PM core that it is safe to run the ->runtime_suspend(), ->runtime_resume() +and ->runtime_idle() callbacks for the given device in atomic context with +interrupts disabled. This implies that the callback routines in question must +not block or sleep, but it also means that the synchronous helper functions +listed at the end of Section 4 may be used for that device within an interrupt +handler or generally in an atomic context. + +The subsystem-level suspend callback, if present, is _entirely_ _responsible_ +for handling the suspend of the device as appropriate, which may, but need not +include executing the device driver's own ->runtime_suspend() callback (from the +PM core's point of view it is not necessary to implement a ->runtime_suspend() +callback in a device driver as long as the subsystem-level suspend callback +knows what to do to handle the device). + + * Once the subsystem-level suspend callback (or the driver suspend callback, + if invoked directly) has completed successfully for the given device, the PM + core regards the device as suspended, which need not mean that it has been + put into a low power state. It is supposed to mean, however, that the + device will not process data and will not communicate with the CPU(s) and + RAM until the appropriate resume callback is executed for it. The runtime + PM status of a device after successful execution of the suspend callback is + 'suspended'. + + * If the suspend callback returns -EBUSY or -EAGAIN, the device's runtime PM + status remains 'active', which means that the device _must_ be fully + operational afterwards. + + * If the suspend callback returns an error code different from -EBUSY and + -EAGAIN, the PM core regards this as a fatal error and will refuse to run + the helper functions described in Section 4 for the device until its status + is directly set to either 'active', or 'suspended' (the PM core provides + special helper functions for this purpose). + +In particular, if the driver requires remote wakeup capability (i.e. hardware +mechanism allowing the device to request a change of its power state, such as +PCI PME) for proper functioning and device_can_wakeup() returns 'false' for the +device, then ->runtime_suspend() should return -EBUSY. On the other hand, if +device_can_wakeup() returns 'true' for the device and the device is put into a +low-power state during the execution of the suspend callback, it is expected +that remote wakeup will be enabled for the device. Generally, remote wakeup +should be enabled for all input devices put into low-power states at run time. + +The subsystem-level resume callback, if present, is **entirely responsible** for +handling the resume of the device as appropriate, which may, but need not +include executing the device driver's own ->runtime_resume() callback (from the +PM core's point of view it is not necessary to implement a ->runtime_resume() +callback in a device driver as long as the subsystem-level resume callback knows +what to do to handle the device). + + * Once the subsystem-level resume callback (or the driver resume callback, if + invoked directly) has completed successfully, the PM core regards the device + as fully operational, which means that the device _must_ be able to complete + I/O operations as needed. The runtime PM status of the device is then + 'active'. + + * If the resume callback returns an error code, the PM core regards this as a + fatal error and will refuse to run the helper functions described in Section + 4 for the device, until its status is directly set to either 'active', or + 'suspended' (by means of special helper functions provided by the PM core + for this purpose). + +The idle callback (a subsystem-level one, if present, or the driver one) is +executed by the PM core whenever the device appears to be idle, which is +indicated to the PM core by two counters, the device's usage counter and the +counter of 'active' children of the device. + + * If any of these counters is decreased using a helper function provided by + the PM core and it turns out to be equal to zero, the other counter is + checked. If that counter also is equal to zero, the PM core executes the + idle callback with the device as its argument. + +The action performed by the idle callback is totally dependent on the subsystem +(or driver) in question, but the expected and recommended action is to check +if the device can be suspended (i.e. if all of the conditions necessary for +suspending the device are satisfied) and to queue up a suspend request for the +device in that case. If there is no idle callback, or if the callback returns +0, then the PM core will attempt to carry out a runtime suspend of the device, +also respecting devices configured for autosuspend. In essence this means a +call to pm_runtime_autosuspend() (do note that drivers needs to update the +device last busy mark, pm_runtime_mark_last_busy(), to control the delay under +this circumstance). To prevent this (for example, if the callback routine has +started a delayed suspend), the routine must return a non-zero value. Negative +error return codes are ignored by the PM core. + +The helper functions provided by the PM core, described in Section 4, guarantee +that the following constraints are met with respect to runtime PM callbacks for +one device: + +(1) The callbacks are mutually exclusive (e.g. it is forbidden to execute + ->runtime_suspend() in parallel with ->runtime_resume() or with another + instance of ->runtime_suspend() for the same device) with the exception that + ->runtime_suspend() or ->runtime_resume() can be executed in parallel with + ->runtime_idle() (although ->runtime_idle() will not be started while any + of the other callbacks is being executed for the same device). + +(2) ->runtime_idle() and ->runtime_suspend() can only be executed for 'active' + devices (i.e. the PM core will only execute ->runtime_idle() or + ->runtime_suspend() for the devices the runtime PM status of which is + 'active'). + +(3) ->runtime_idle() and ->runtime_suspend() can only be executed for a device + the usage counter of which is equal to zero _and_ either the counter of + 'active' children of which is equal to zero, or the 'power.ignore_children' + flag of which is set. + +(4) ->runtime_resume() can only be executed for 'suspended' devices (i.e. the + PM core will only execute ->runtime_resume() for the devices the runtime + PM status of which is 'suspended'). + +Additionally, the helper functions provided by the PM core obey the following +rules: + + * If ->runtime_suspend() is about to be executed or there's a pending request + to execute it, ->runtime_idle() will not be executed for the same device. + + * A request to execute or to schedule the execution of ->runtime_suspend() + will cancel any pending requests to execute ->runtime_idle() for the same + device. + + * If ->runtime_resume() is about to be executed or there's a pending request + to execute it, the other callbacks will not be executed for the same device. + + * A request to execute ->runtime_resume() will cancel any pending or + scheduled requests to execute the other callbacks for the same device, + except for scheduled autosuspends. + +3. Runtime PM Device Fields +=========================== + +The following device runtime PM fields are present in 'struct dev_pm_info', as +defined in include/linux/pm.h: + + `struct timer_list suspend_timer;` + - timer used for scheduling (delayed) suspend and autosuspend requests + + `unsigned long timer_expires;` + - timer expiration time, in jiffies (if this is different from zero, the + timer is running and will expire at that time, otherwise the timer is not + running) + + `struct work_struct work;` + - work structure used for queuing up requests (i.e. work items in pm_wq) + + `wait_queue_head_t wait_queue;` + - wait queue used if any of the helper functions needs to wait for another + one to complete + + `spinlock_t lock;` + - lock used for synchronization + + `atomic_t usage_count;` + - the usage counter of the device + + `atomic_t child_count;` + - the count of 'active' children of the device + + `unsigned int ignore_children;` + - if set, the value of child_count is ignored (but still updated) + + `unsigned int disable_depth;` + - used for disabling the helper functions (they work normally if this is + equal to zero); the initial value of it is 1 (i.e. runtime PM is + initially disabled for all devices) + + `int runtime_error;` + - if set, there was a fatal error (one of the callbacks returned error code + as described in Section 2), so the helper functions will not work until + this flag is cleared; this is the error code returned by the failing + callback + + `unsigned int idle_notification;` + - if set, ->runtime_idle() is being executed + + `unsigned int request_pending;` + - if set, there's a pending request (i.e. a work item queued up into pm_wq) + + `enum rpm_request request;` + - type of request that's pending (valid if request_pending is set) + + `unsigned int deferred_resume;` + - set if ->runtime_resume() is about to be run while ->runtime_suspend() is + being executed for that device and it is not practical to wait for the + suspend to complete; means "start a resume as soon as you've suspended" + + `enum rpm_status runtime_status;` + - the runtime PM status of the device; this field's initial value is + RPM_SUSPENDED, which means that each device is initially regarded by the + PM core as 'suspended', regardless of its real hardware status + + `unsigned int runtime_auto;` + - if set, indicates that the user space has allowed the device driver to + power manage the device at run time via the /sys/devices/.../power/control + `interface;` it may only be modified with the help of the pm_runtime_allow() + and pm_runtime_forbid() helper functions + + `unsigned int no_callbacks;` + - indicates that the device does not use the runtime PM callbacks (see + Section 8); it may be modified only by the pm_runtime_no_callbacks() + helper function + + `unsigned int irq_safe;` + - indicates that the ->runtime_suspend() and ->runtime_resume() callbacks + will be invoked with the spinlock held and interrupts disabled + + `unsigned int use_autosuspend;` + - indicates that the device's driver supports delayed autosuspend (see + Section 9); it may be modified only by the + pm_runtime{_dont}_use_autosuspend() helper functions + + `unsigned int timer_autosuspends;` + - indicates that the PM core should attempt to carry out an autosuspend + when the timer expires rather than a normal suspend + + `int autosuspend_delay;` + - the delay time (in milliseconds) to be used for autosuspend + + `unsigned long last_busy;` + - the time (in jiffies) when the pm_runtime_mark_last_busy() helper + function was last called for this device; used in calculating inactivity + periods for autosuspend + +All of the above fields are members of the 'power' member of 'struct device'. + +4. Runtime PM Device Helper Functions +===================================== + +The following runtime PM helper functions are defined in +drivers/base/power/runtime.c and include/linux/pm_runtime.h: + + `void pm_runtime_init(struct device *dev);` + - initialize the device runtime PM fields in 'struct dev_pm_info' + + `void pm_runtime_remove(struct device *dev);` + - make sure that the runtime PM of the device will be disabled after + removing the device from device hierarchy + + `int pm_runtime_idle(struct device *dev);` + - execute the subsystem-level idle callback for the device; returns an + error code on failure, where -EINPROGRESS means that ->runtime_idle() is + already being executed; if there is no callback or the callback returns 0 + then run pm_runtime_autosuspend(dev) and return its result + + `int pm_runtime_suspend(struct device *dev);` + - execute the subsystem-level suspend callback for the device; returns 0 on + success, 1 if the device's runtime PM status was already 'suspended', or + error code on failure, where -EAGAIN or -EBUSY means it is safe to attempt + to suspend the device again in future and -EACCES means that + 'power.disable_depth' is different from 0 + + `int pm_runtime_autosuspend(struct device *dev);` + - same as pm_runtime_suspend() except that the autosuspend delay is taken + `into account;` if pm_runtime_autosuspend_expiration() says the delay has + not yet expired then an autosuspend is scheduled for the appropriate time + and 0 is returned + + `int pm_runtime_resume(struct device *dev);` + - execute the subsystem-level resume callback for the device; returns 0 on + success, 1 if the device's runtime PM status was already 'active' or + error code on failure, where -EAGAIN means it may be safe to attempt to + resume the device again in future, but 'power.runtime_error' should be + checked additionally, and -EACCES means that 'power.disable_depth' is + different from 0 + + `int pm_request_idle(struct device *dev);` + - submit a request to execute the subsystem-level idle callback for the + device (the request is represented by a work item in pm_wq); returns 0 on + success or error code if the request has not been queued up + + `int pm_request_autosuspend(struct device *dev);` + - schedule the execution of the subsystem-level suspend callback for the + device when the autosuspend delay has expired; if the delay has already + expired then the work item is queued up immediately + + `int pm_schedule_suspend(struct device *dev, unsigned int delay);` + - schedule the execution of the subsystem-level suspend callback for the + device in future, where 'delay' is the time to wait before queuing up a + suspend work item in pm_wq, in milliseconds (if 'delay' is zero, the work + item is queued up immediately); returns 0 on success, 1 if the device's PM + runtime status was already 'suspended', or error code if the request + hasn't been scheduled (or queued up if 'delay' is 0); if the execution of + ->runtime_suspend() is already scheduled and not yet expired, the new + value of 'delay' will be used as the time to wait + + `int pm_request_resume(struct device *dev);` + - submit a request to execute the subsystem-level resume callback for the + device (the request is represented by a work item in pm_wq); returns 0 on + success, 1 if the device's runtime PM status was already 'active', or + error code if the request hasn't been queued up + + `void pm_runtime_get_noresume(struct device *dev);` + - increment the device's usage counter + + `int pm_runtime_get(struct device *dev);` + - increment the device's usage counter, run pm_request_resume(dev) and + return its result + + `int pm_runtime_get_sync(struct device *dev);` + - increment the device's usage counter, run pm_runtime_resume(dev) and + return its result + + `int pm_runtime_get_if_in_use(struct device *dev);` + - return -EINVAL if 'power.disable_depth' is nonzero; otherwise, if the + runtime PM status is RPM_ACTIVE and the runtime PM usage counter is + nonzero, increment the counter and return 1; otherwise return 0 without + changing the counter + + `void pm_runtime_put_noidle(struct device *dev);` + - decrement the device's usage counter + + `int pm_runtime_put(struct device *dev);` + - decrement the device's usage counter; if the result is 0 then run + pm_request_idle(dev) and return its result + + `int pm_runtime_put_autosuspend(struct device *dev);` + - decrement the device's usage counter; if the result is 0 then run + pm_request_autosuspend(dev) and return its result + + `int pm_runtime_put_sync(struct device *dev);` + - decrement the device's usage counter; if the result is 0 then run + pm_runtime_idle(dev) and return its result + + `int pm_runtime_put_sync_suspend(struct device *dev);` + - decrement the device's usage counter; if the result is 0 then run + pm_runtime_suspend(dev) and return its result + + `int pm_runtime_put_sync_autosuspend(struct device *dev);` + - decrement the device's usage counter; if the result is 0 then run + pm_runtime_autosuspend(dev) and return its result + + `void pm_runtime_enable(struct device *dev);` + - decrement the device's 'power.disable_depth' field; if that field is equal + to zero, the runtime PM helper functions can execute subsystem-level + callbacks described in Section 2 for the device + + `int pm_runtime_disable(struct device *dev);` + - increment the device's 'power.disable_depth' field (if the value of that + field was previously zero, this prevents subsystem-level runtime PM + callbacks from being run for the device), make sure that all of the + pending runtime PM operations on the device are either completed or + canceled; returns 1 if there was a resume request pending and it was + necessary to execute the subsystem-level resume callback for the device + to satisfy that request, otherwise 0 is returned + + `int pm_runtime_barrier(struct device *dev);` + - check if there's a resume request pending for the device and resume it + (synchronously) in that case, cancel any other pending runtime PM requests + regarding it and wait for all runtime PM operations on it in progress to + complete; returns 1 if there was a resume request pending and it was + necessary to execute the subsystem-level resume callback for the device to + satisfy that request, otherwise 0 is returned + + `void pm_suspend_ignore_children(struct device *dev, bool enable);` + - set/unset the power.ignore_children flag of the device + + `int pm_runtime_set_active(struct device *dev);` + - clear the device's 'power.runtime_error' flag, set the device's runtime + PM status to 'active' and update its parent's counter of 'active' + children as appropriate (it is only valid to use this function if + 'power.runtime_error' is set or 'power.disable_depth' is greater than + zero); it will fail and return error code if the device has a parent + which is not active and the 'power.ignore_children' flag of which is unset + + `void pm_runtime_set_suspended(struct device *dev);` + - clear the device's 'power.runtime_error' flag, set the device's runtime + PM status to 'suspended' and update its parent's counter of 'active' + children as appropriate (it is only valid to use this function if + 'power.runtime_error' is set or 'power.disable_depth' is greater than + zero) + + `bool pm_runtime_active(struct device *dev);` + - return true if the device's runtime PM status is 'active' or its + 'power.disable_depth' field is not equal to zero, or false otherwise + + `bool pm_runtime_suspended(struct device *dev);` + - return true if the device's runtime PM status is 'suspended' and its + 'power.disable_depth' field is equal to zero, or false otherwise + + `bool pm_runtime_status_suspended(struct device *dev);` + - return true if the device's runtime PM status is 'suspended' + + `void pm_runtime_allow(struct device *dev);` + - set the power.runtime_auto flag for the device and decrease its usage + counter (used by the /sys/devices/.../power/control interface to + effectively allow the device to be power managed at run time) + + `void pm_runtime_forbid(struct device *dev);` + - unset the power.runtime_auto flag for the device and increase its usage + counter (used by the /sys/devices/.../power/control interface to + effectively prevent the device from being power managed at run time) + + `void pm_runtime_no_callbacks(struct device *dev);` + - set the power.no_callbacks flag for the device and remove the runtime + PM attributes from /sys/devices/.../power (or prevent them from being + added when the device is registered) + + `void pm_runtime_irq_safe(struct device *dev);` + - set the power.irq_safe flag for the device, causing the runtime-PM + callbacks to be invoked with interrupts off + + `bool pm_runtime_is_irq_safe(struct device *dev);` + - return true if power.irq_safe flag was set for the device, causing + the runtime-PM callbacks to be invoked with interrupts off + + `void pm_runtime_mark_last_busy(struct device *dev);` + - set the power.last_busy field to the current time + + `void pm_runtime_use_autosuspend(struct device *dev);` + - set the power.use_autosuspend flag, enabling autosuspend delays; call + pm_runtime_get_sync if the flag was previously cleared and + power.autosuspend_delay is negative + + `void pm_runtime_dont_use_autosuspend(struct device *dev);` + - clear the power.use_autosuspend flag, disabling autosuspend delays; + decrement the device's usage counter if the flag was previously set and + power.autosuspend_delay is negative; call pm_runtime_idle + + `void pm_runtime_set_autosuspend_delay(struct device *dev, int delay);` + - set the power.autosuspend_delay value to 'delay' (expressed in + milliseconds); if 'delay' is negative then runtime suspends are + prevented; if power.use_autosuspend is set, pm_runtime_get_sync may be + called or the device's usage counter may be decremented and + pm_runtime_idle called depending on if power.autosuspend_delay is + changed to or from a negative value; if power.use_autosuspend is clear, + pm_runtime_idle is called + + `unsigned long pm_runtime_autosuspend_expiration(struct device *dev);` + - calculate the time when the current autosuspend delay period will expire, + based on power.last_busy and power.autosuspend_delay; if the delay time + is 1000 ms or larger then the expiration time is rounded up to the + nearest second; returns 0 if the delay period has already expired or + power.use_autosuspend isn't set, otherwise returns the expiration time + in jiffies + +It is safe to execute the following helper functions from interrupt context: + +- pm_request_idle() +- pm_request_autosuspend() +- pm_schedule_suspend() +- pm_request_resume() +- pm_runtime_get_noresume() +- pm_runtime_get() +- pm_runtime_put_noidle() +- pm_runtime_put() +- pm_runtime_put_autosuspend() +- pm_runtime_enable() +- pm_suspend_ignore_children() +- pm_runtime_set_active() +- pm_runtime_set_suspended() +- pm_runtime_suspended() +- pm_runtime_mark_last_busy() +- pm_runtime_autosuspend_expiration() + +If pm_runtime_irq_safe() has been called for a device then the following helper +functions may also be used in interrupt context: + +- pm_runtime_idle() +- pm_runtime_suspend() +- pm_runtime_autosuspend() +- pm_runtime_resume() +- pm_runtime_get_sync() +- pm_runtime_put_sync() +- pm_runtime_put_sync_suspend() +- pm_runtime_put_sync_autosuspend() + +5. Runtime PM Initialization, Device Probing and Removal +======================================================== + +Initially, the runtime PM is disabled for all devices, which means that the +majority of the runtime PM helper functions described in Section 4 will return +-EAGAIN until pm_runtime_enable() is called for the device. + +In addition to that, the initial runtime PM status of all devices is +'suspended', but it need not reflect the actual physical state of the device. +Thus, if the device is initially active (i.e. it is able to process I/O), its +runtime PM status must be changed to 'active', with the help of +pm_runtime_set_active(), before pm_runtime_enable() is called for the device. + +However, if the device has a parent and the parent's runtime PM is enabled, +calling pm_runtime_set_active() for the device will affect the parent, unless +the parent's 'power.ignore_children' flag is set. Namely, in that case the +parent won't be able to suspend at run time, using the PM core's helper +functions, as long as the child's status is 'active', even if the child's +runtime PM is still disabled (i.e. pm_runtime_enable() hasn't been called for +the child yet or pm_runtime_disable() has been called for it). For this reason, +once pm_runtime_set_active() has been called for the device, pm_runtime_enable() +should be called for it too as soon as reasonably possible or its runtime PM +status should be changed back to 'suspended' with the help of +pm_runtime_set_suspended(). + +If the default initial runtime PM status of the device (i.e. 'suspended') +reflects the actual state of the device, its bus type's or its driver's +->probe() callback will likely need to wake it up using one of the PM core's +helper functions described in Section 4. In that case, pm_runtime_resume() +should be used. Of course, for this purpose the device's runtime PM has to be +enabled earlier by calling pm_runtime_enable(). + +Note, if the device may execute pm_runtime calls during the probe (such as +if it is registers with a subsystem that may call back in) then the +pm_runtime_get_sync() call paired with a pm_runtime_put() call will be +appropriate to ensure that the device is not put back to sleep during the +probe. This can happen with systems such as the network device layer. + +It may be desirable to suspend the device once ->probe() has finished. +Therefore the driver core uses the asynchronous pm_request_idle() to submit a +request to execute the subsystem-level idle callback for the device at that +time. A driver that makes use of the runtime autosuspend feature, may want to +update the last busy mark before returning from ->probe(). + +Moreover, the driver core prevents runtime PM callbacks from racing with the bus +notifier callback in __device_release_driver(), which is necessary, because the +notifier is used by some subsystems to carry out operations affecting the +runtime PM functionality. It does so by calling pm_runtime_get_sync() before +driver_sysfs_remove() and the BUS_NOTIFY_UNBIND_DRIVER notifications. This +resumes the device if it's in the suspended state and prevents it from +being suspended again while those routines are being executed. + +To allow bus types and drivers to put devices into the suspended state by +calling pm_runtime_suspend() from their ->remove() routines, the driver core +executes pm_runtime_put_sync() after running the BUS_NOTIFY_UNBIND_DRIVER +notifications in __device_release_driver(). This requires bus types and +drivers to make their ->remove() callbacks avoid races with runtime PM directly, +but also it allows of more flexibility in the handling of devices during the +removal of their drivers. + +Drivers in ->remove() callback should undo the runtime PM changes done +in ->probe(). Usually this means calling pm_runtime_disable(), +pm_runtime_dont_use_autosuspend() etc. + +The user space can effectively disallow the driver of the device to power manage +it at run time by changing the value of its /sys/devices/.../power/control +attribute to "on", which causes pm_runtime_forbid() to be called. In principle, +this mechanism may also be used by the driver to effectively turn off the +runtime power management of the device until the user space turns it on. +Namely, during the initialization the driver can make sure that the runtime PM +status of the device is 'active' and call pm_runtime_forbid(). It should be +noted, however, that if the user space has already intentionally changed the +value of /sys/devices/.../power/control to "auto" to allow the driver to power +manage the device at run time, the driver may confuse it by using +pm_runtime_forbid() this way. + +6. Runtime PM and System Sleep +============================== + +Runtime PM and system sleep (i.e., system suspend and hibernation, also known +as suspend-to-RAM and suspend-to-disk) interact with each other in a couple of +ways. If a device is active when a system sleep starts, everything is +straightforward. But what should happen if the device is already suspended? + +The device may have different wake-up settings for runtime PM and system sleep. +For example, remote wake-up may be enabled for runtime suspend but disallowed +for system sleep (device_may_wakeup(dev) returns 'false'). When this happens, +the subsystem-level system suspend callback is responsible for changing the +device's wake-up setting (it may leave that to the device driver's system +suspend routine). It may be necessary to resume the device and suspend it again +in order to do so. The same is true if the driver uses different power levels +or other settings for runtime suspend and system sleep. + +During system resume, the simplest approach is to bring all devices back to full +power, even if they had been suspended before the system suspend began. There +are several reasons for this, including: + + * The device might need to switch power levels, wake-up settings, etc. + + * Remote wake-up events might have been lost by the firmware. + + * The device's children may need the device to be at full power in order + to resume themselves. + + * The driver's idea of the device state may not agree with the device's + physical state. This can happen during resume from hibernation. + + * The device might need to be reset. + + * Even though the device was suspended, if its usage counter was > 0 then most + likely it would need a runtime resume in the near future anyway. + +If the device had been suspended before the system suspend began and it's +brought back to full power during resume, then its runtime PM status will have +to be updated to reflect the actual post-system sleep status. The way to do +this is: + + - pm_runtime_disable(dev); + - pm_runtime_set_active(dev); + - pm_runtime_enable(dev); + +The PM core always increments the runtime usage counter before calling the +->suspend() callback and decrements it after calling the ->resume() callback. +Hence disabling runtime PM temporarily like this will not cause any runtime +suspend attempts to be permanently lost. If the usage count goes to zero +following the return of the ->resume() callback, the ->runtime_idle() callback +will be invoked as usual. + +On some systems, however, system sleep is not entered through a global firmware +or hardware operation. Instead, all hardware components are put into low-power +states directly by the kernel in a coordinated way. Then, the system sleep +state effectively follows from the states the hardware components end up in +and the system is woken up from that state by a hardware interrupt or a similar +mechanism entirely under the kernel's control. As a result, the kernel never +gives control away and the states of all devices during resume are precisely +known to it. If that is the case and none of the situations listed above takes +place (in particular, if the system is not waking up from hibernation), it may +be more efficient to leave the devices that had been suspended before the system +suspend began in the suspended state. + +To this end, the PM core provides a mechanism allowing some coordination between +different levels of device hierarchy. Namely, if a system suspend .prepare() +callback returns a positive number for a device, that indicates to the PM core +that the device appears to be runtime-suspended and its state is fine, so it +may be left in runtime suspend provided that all of its descendants are also +left in runtime suspend. If that happens, the PM core will not execute any +system suspend and resume callbacks for all of those devices, except for the +complete callback, which is then entirely responsible for handling the device +as appropriate. This only applies to system suspend transitions that are not +related to hibernation (see Documentation/driver-api/pm/devices.rst for more +information). + +The PM core does its best to reduce the probability of race conditions between +the runtime PM and system suspend/resume (and hibernation) callbacks by carrying +out the following operations: + + * During system suspend pm_runtime_get_noresume() is called for every device + right before executing the subsystem-level .prepare() callback for it and + pm_runtime_barrier() is called for every device right before executing the + subsystem-level .suspend() callback for it. In addition to that the PM core + calls __pm_runtime_disable() with 'false' as the second argument for every + device right before executing the subsystem-level .suspend_late() callback + for it. + + * During system resume pm_runtime_enable() and pm_runtime_put() are called for + every device right after executing the subsystem-level .resume_early() + callback and right after executing the subsystem-level .complete() callback + for it, respectively. + +7. Generic subsystem callbacks + +Subsystems may wish to conserve code space by using the set of generic power +management callbacks provided by the PM core, defined in +driver/base/power/generic_ops.c: + + `int pm_generic_runtime_suspend(struct device *dev);` + - invoke the ->runtime_suspend() callback provided by the driver of this + device and return its result, or return 0 if not defined + + `int pm_generic_runtime_resume(struct device *dev);` + - invoke the ->runtime_resume() callback provided by the driver of this + device and return its result, or return 0 if not defined + + `int pm_generic_suspend(struct device *dev);` + - if the device has not been suspended at run time, invoke the ->suspend() + callback provided by its driver and return its result, or return 0 if not + defined + + `int pm_generic_suspend_noirq(struct device *dev);` + - if pm_runtime_suspended(dev) returns "false", invoke the ->suspend_noirq() + callback provided by the device's driver and return its result, or return + 0 if not defined + + `int pm_generic_resume(struct device *dev);` + - invoke the ->resume() callback provided by the driver of this device and, + if successful, change the device's runtime PM status to 'active' + + `int pm_generic_resume_noirq(struct device *dev);` + - invoke the ->resume_noirq() callback provided by the driver of this device + + `int pm_generic_freeze(struct device *dev);` + - if the device has not been suspended at run time, invoke the ->freeze() + callback provided by its driver and return its result, or return 0 if not + defined + + `int pm_generic_freeze_noirq(struct device *dev);` + - if pm_runtime_suspended(dev) returns "false", invoke the ->freeze_noirq() + callback provided by the device's driver and return its result, or return + 0 if not defined + + `int pm_generic_thaw(struct device *dev);` + - if the device has not been suspended at run time, invoke the ->thaw() + callback provided by its driver and return its result, or return 0 if not + defined + + `int pm_generic_thaw_noirq(struct device *dev);` + - if pm_runtime_suspended(dev) returns "false", invoke the ->thaw_noirq() + callback provided by the device's driver and return its result, or return + 0 if not defined + + `int pm_generic_poweroff(struct device *dev);` + - if the device has not been suspended at run time, invoke the ->poweroff() + callback provided by its driver and return its result, or return 0 if not + defined + + `int pm_generic_poweroff_noirq(struct device *dev);` + - if pm_runtime_suspended(dev) returns "false", run the ->poweroff_noirq() + callback provided by the device's driver and return its result, or return + 0 if not defined + + `int pm_generic_restore(struct device *dev);` + - invoke the ->restore() callback provided by the driver of this device and, + if successful, change the device's runtime PM status to 'active' + + `int pm_generic_restore_noirq(struct device *dev);` + - invoke the ->restore_noirq() callback provided by the device's driver + +These functions are the defaults used by the PM core, if a subsystem doesn't +provide its own callbacks for ->runtime_idle(), ->runtime_suspend(), +->runtime_resume(), ->suspend(), ->suspend_noirq(), ->resume(), +->resume_noirq(), ->freeze(), ->freeze_noirq(), ->thaw(), ->thaw_noirq(), +->poweroff(), ->poweroff_noirq(), ->restore(), ->restore_noirq() in the +subsystem-level dev_pm_ops structure. + +Device drivers that wish to use the same function as a system suspend, freeze, +poweroff and runtime suspend callback, and similarly for system resume, thaw, +restore, and runtime resume, can achieve this with the help of the +UNIVERSAL_DEV_PM_OPS macro defined in include/linux/pm.h (possibly setting its +last argument to NULL). + +8. "No-Callback" Devices +======================== + +Some "devices" are only logical sub-devices of their parent and cannot be +power-managed on their own. (The prototype example is a USB interface. Entire +USB devices can go into low-power mode or send wake-up requests, but neither is +possible for individual interfaces.) The drivers for these devices have no +need of runtime PM callbacks; if the callbacks did exist, ->runtime_suspend() +and ->runtime_resume() would always return 0 without doing anything else and +->runtime_idle() would always call pm_runtime_suspend(). + +Subsystems can tell the PM core about these devices by calling +pm_runtime_no_callbacks(). This should be done after the device structure is +initialized and before it is registered (although after device registration is +also okay). The routine will set the device's power.no_callbacks flag and +prevent the non-debugging runtime PM sysfs attributes from being created. + +When power.no_callbacks is set, the PM core will not invoke the +->runtime_idle(), ->runtime_suspend(), or ->runtime_resume() callbacks. +Instead it will assume that suspends and resumes always succeed and that idle +devices should be suspended. + +As a consequence, the PM core will never directly inform the device's subsystem +or driver about runtime power changes. Instead, the driver for the device's +parent must take responsibility for telling the device's driver when the +parent's power state changes. + +9. Autosuspend, or automatically-delayed suspends +================================================= + +Changing a device's power state isn't free; it requires both time and energy. +A device should be put in a low-power state only when there's some reason to +think it will remain in that state for a substantial time. A common heuristic +says that a device which hasn't been used for a while is liable to remain +unused; following this advice, drivers should not allow devices to be suspended +at runtime until they have been inactive for some minimum period. Even when +the heuristic ends up being non-optimal, it will still prevent devices from +"bouncing" too rapidly between low-power and full-power states. + +The term "autosuspend" is an historical remnant. It doesn't mean that the +device is automatically suspended (the subsystem or driver still has to call +the appropriate PM routines); rather it means that runtime suspends will +automatically be delayed until the desired period of inactivity has elapsed. + +Inactivity is determined based on the power.last_busy field. Drivers should +call pm_runtime_mark_last_busy() to update this field after carrying out I/O, +typically just before calling pm_runtime_put_autosuspend(). The desired length +of the inactivity period is a matter of policy. Subsystems can set this length +initially by calling pm_runtime_set_autosuspend_delay(), but after device +registration the length should be controlled by user space, using the +/sys/devices/.../power/autosuspend_delay_ms attribute. + +In order to use autosuspend, subsystems or drivers must call +pm_runtime_use_autosuspend() (preferably before registering the device), and +thereafter they should use the various `*_autosuspend()` helper functions +instead of the non-autosuspend counterparts:: + + Instead of: pm_runtime_suspend use: pm_runtime_autosuspend; + Instead of: pm_schedule_suspend use: pm_request_autosuspend; + Instead of: pm_runtime_put use: pm_runtime_put_autosuspend; + Instead of: pm_runtime_put_sync use: pm_runtime_put_sync_autosuspend. + +Drivers may also continue to use the non-autosuspend helper functions; they +will behave normally, which means sometimes taking the autosuspend delay into +account (see pm_runtime_idle). + +Under some circumstances a driver or subsystem may want to prevent a device +from autosuspending immediately, even though the usage counter is zero and the +autosuspend delay time has expired. If the ->runtime_suspend() callback +returns -EAGAIN or -EBUSY, and if the next autosuspend delay expiration time is +in the future (as it normally would be if the callback invoked +pm_runtime_mark_last_busy()), the PM core will automatically reschedule the +autosuspend. The ->runtime_suspend() callback can't do this rescheduling +itself because no suspend requests of any kind are accepted while the device is +suspending (i.e., while the callback is running). + +The implementation is well suited for asynchronous use in interrupt contexts. +However such use inevitably involves races, because the PM core can't +synchronize ->runtime_suspend() callbacks with the arrival of I/O requests. +This synchronization must be handled by the driver, using its private lock. +Here is a schematic pseudo-code example:: + + foo_read_or_write(struct foo_priv *foo, void *data) + { + lock(&foo->private_lock); + add_request_to_io_queue(foo, data); + if (foo->num_pending_requests++ == 0) + pm_runtime_get(&foo->dev); + if (!foo->is_suspended) + foo_process_next_request(foo); + unlock(&foo->private_lock); + } + + foo_io_completion(struct foo_priv *foo, void *req) + { + lock(&foo->private_lock); + if (--foo->num_pending_requests == 0) { + pm_runtime_mark_last_busy(&foo->dev); + pm_runtime_put_autosuspend(&foo->dev); + } else { + foo_process_next_request(foo); + } + unlock(&foo->private_lock); + /* Send req result back to the user ... */ + } + + int foo_runtime_suspend(struct device *dev) + { + struct foo_priv foo = container_of(dev, ...); + int ret = 0; + + lock(&foo->private_lock); + if (foo->num_pending_requests > 0) { + ret = -EBUSY; + } else { + /* ... suspend the device ... */ + foo->is_suspended = 1; + } + unlock(&foo->private_lock); + return ret; + } + + int foo_runtime_resume(struct device *dev) + { + struct foo_priv foo = container_of(dev, ...); + + lock(&foo->private_lock); + /* ... resume the device ... */ + foo->is_suspended = 0; + pm_runtime_mark_last_busy(&foo->dev); + if (foo->num_pending_requests > 0) + foo_process_next_request(foo); + unlock(&foo->private_lock); + return 0; + } + +The important point is that after foo_io_completion() asks for an autosuspend, +the foo_runtime_suspend() callback may race with foo_read_or_write(). +Therefore foo_runtime_suspend() has to check whether there are any pending I/O +requests (while holding the private lock) before allowing the suspend to +proceed. + +In addition, the power.autosuspend_delay field can be changed by user space at +any time. If a driver cares about this, it can call +pm_runtime_autosuspend_expiration() from within the ->runtime_suspend() +callback while holding its private lock. If the function returns a nonzero +value then the delay has not yet expired and the callback should return +-EAGAIN. diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt deleted file mode 100644 index 937e33c46211..000000000000 --- a/Documentation/power/runtime_pm.txt +++ /dev/null @@ -1,928 +0,0 @@ -Runtime Power Management Framework for I/O Devices - -(C) 2009-2011 Rafael J. Wysocki , Novell Inc. -(C) 2010 Alan Stern -(C) 2014 Intel Corp., Rafael J. Wysocki - -1. Introduction - -Support for runtime power management (runtime PM) of I/O devices is provided -at the power management core (PM core) level by means of: - -* The power management workqueue pm_wq in which bus types and device drivers can - put their PM-related work items. It is strongly recommended that pm_wq be - used for queuing all work items related to runtime PM, because this allows - them to be synchronized with system-wide power transitions (suspend to RAM, - hibernation and resume from system sleep states). pm_wq is declared in - include/linux/pm_runtime.h and defined in kernel/power/main.c. - -* A number of runtime PM fields in the 'power' member of 'struct device' (which - is of the type 'struct dev_pm_info', defined in include/linux/pm.h) that can - be used for synchronizing runtime PM operations with one another. - -* Three device runtime PM callbacks in 'struct dev_pm_ops' (defined in - include/linux/pm.h). - -* A set of helper functions defined in drivers/base/power/runtime.c that can be - used for carrying out runtime PM operations in such a way that the - synchronization between them is taken care of by the PM core. Bus types and - device drivers are encouraged to use these functions. - -The runtime PM callbacks present in 'struct dev_pm_ops', the device runtime PM -fields of 'struct dev_pm_info' and the core helper functions provided for -runtime PM are described below. - -2. Device Runtime PM Callbacks - -There are three device runtime PM callbacks defined in 'struct dev_pm_ops': - -struct dev_pm_ops { - ... - int (*runtime_suspend)(struct device *dev); - int (*runtime_resume)(struct device *dev); - int (*runtime_idle)(struct device *dev); - ... -}; - -The ->runtime_suspend(), ->runtime_resume() and ->runtime_idle() callbacks -are executed by the PM core for the device's subsystem that may be either of -the following: - - 1. PM domain of the device, if the device's PM domain object, dev->pm_domain, - is present. - - 2. Device type of the device, if both dev->type and dev->type->pm are present. - - 3. Device class of the device, if both dev->class and dev->class->pm are - present. - - 4. Bus type of the device, if both dev->bus and dev->bus->pm are present. - -If the subsystem chosen by applying the above rules doesn't provide the relevant -callback, the PM core will invoke the corresponding driver callback stored in -dev->driver->pm directly (if present). - -The PM core always checks which callback to use in the order given above, so the -priority order of callbacks from high to low is: PM domain, device type, class -and bus type. Moreover, the high-priority one will always take precedence over -a low-priority one. The PM domain, bus type, device type and class callbacks -are referred to as subsystem-level callbacks in what follows. - -By default, the callbacks are always invoked in process context with interrupts -enabled. However, the pm_runtime_irq_safe() helper function can be used to tell -the PM core that it is safe to run the ->runtime_suspend(), ->runtime_resume() -and ->runtime_idle() callbacks for the given device in atomic context with -interrupts disabled. This implies that the callback routines in question must -not block or sleep, but it also means that the synchronous helper functions -listed at the end of Section 4 may be used for that device within an interrupt -handler or generally in an atomic context. - -The subsystem-level suspend callback, if present, is _entirely_ _responsible_ -for handling the suspend of the device as appropriate, which may, but need not -include executing the device driver's own ->runtime_suspend() callback (from the -PM core's point of view it is not necessary to implement a ->runtime_suspend() -callback in a device driver as long as the subsystem-level suspend callback -knows what to do to handle the device). - - * Once the subsystem-level suspend callback (or the driver suspend callback, - if invoked directly) has completed successfully for the given device, the PM - core regards the device as suspended, which need not mean that it has been - put into a low power state. It is supposed to mean, however, that the - device will not process data and will not communicate with the CPU(s) and - RAM until the appropriate resume callback is executed for it. The runtime - PM status of a device after successful execution of the suspend callback is - 'suspended'. - - * If the suspend callback returns -EBUSY or -EAGAIN, the device's runtime PM - status remains 'active', which means that the device _must_ be fully - operational afterwards. - - * If the suspend callback returns an error code different from -EBUSY and - -EAGAIN, the PM core regards this as a fatal error and will refuse to run - the helper functions described in Section 4 for the device until its status - is directly set to either 'active', or 'suspended' (the PM core provides - special helper functions for this purpose). - -In particular, if the driver requires remote wakeup capability (i.e. hardware -mechanism allowing the device to request a change of its power state, such as -PCI PME) for proper functioning and device_can_wakeup() returns 'false' for the -device, then ->runtime_suspend() should return -EBUSY. On the other hand, if -device_can_wakeup() returns 'true' for the device and the device is put into a -low-power state during the execution of the suspend callback, it is expected -that remote wakeup will be enabled for the device. Generally, remote wakeup -should be enabled for all input devices put into low-power states at run time. - -The subsystem-level resume callback, if present, is _entirely_ _responsible_ for -handling the resume of the device as appropriate, which may, but need not -include executing the device driver's own ->runtime_resume() callback (from the -PM core's point of view it is not necessary to implement a ->runtime_resume() -callback in a device driver as long as the subsystem-level resume callback knows -what to do to handle the device). - - * Once the subsystem-level resume callback (or the driver resume callback, if - invoked directly) has completed successfully, the PM core regards the device - as fully operational, which means that the device _must_ be able to complete - I/O operations as needed. The runtime PM status of the device is then - 'active'. - - * If the resume callback returns an error code, the PM core regards this as a - fatal error and will refuse to run the helper functions described in Section - 4 for the device, until its status is directly set to either 'active', or - 'suspended' (by means of special helper functions provided by the PM core - for this purpose). - -The idle callback (a subsystem-level one, if present, or the driver one) is -executed by the PM core whenever the device appears to be idle, which is -indicated to the PM core by two counters, the device's usage counter and the -counter of 'active' children of the device. - - * If any of these counters is decreased using a helper function provided by - the PM core and it turns out to be equal to zero, the other counter is - checked. If that counter also is equal to zero, the PM core executes the - idle callback with the device as its argument. - -The action performed by the idle callback is totally dependent on the subsystem -(or driver) in question, but the expected and recommended action is to check -if the device can be suspended (i.e. if all of the conditions necessary for -suspending the device are satisfied) and to queue up a suspend request for the -device in that case. If there is no idle callback, or if the callback returns -0, then the PM core will attempt to carry out a runtime suspend of the device, -also respecting devices configured for autosuspend. In essence this means a -call to pm_runtime_autosuspend() (do note that drivers needs to update the -device last busy mark, pm_runtime_mark_last_busy(), to control the delay under -this circumstance). To prevent this (for example, if the callback routine has -started a delayed suspend), the routine must return a non-zero value. Negative -error return codes are ignored by the PM core. - -The helper functions provided by the PM core, described in Section 4, guarantee -that the following constraints are met with respect to runtime PM callbacks for -one device: - -(1) The callbacks are mutually exclusive (e.g. it is forbidden to execute - ->runtime_suspend() in parallel with ->runtime_resume() or with another - instance of ->runtime_suspend() for the same device) with the exception that - ->runtime_suspend() or ->runtime_resume() can be executed in parallel with - ->runtime_idle() (although ->runtime_idle() will not be started while any - of the other callbacks is being executed for the same device). - -(2) ->runtime_idle() and ->runtime_suspend() can only be executed for 'active' - devices (i.e. the PM core will only execute ->runtime_idle() or - ->runtime_suspend() for the devices the runtime PM status of which is - 'active'). - -(3) ->runtime_idle() and ->runtime_suspend() can only be executed for a device - the usage counter of which is equal to zero _and_ either the counter of - 'active' children of which is equal to zero, or the 'power.ignore_children' - flag of which is set. - -(4) ->runtime_resume() can only be executed for 'suspended' devices (i.e. the - PM core will only execute ->runtime_resume() for the devices the runtime - PM status of which is 'suspended'). - -Additionally, the helper functions provided by the PM core obey the following -rules: - - * If ->runtime_suspend() is about to be executed or there's a pending request - to execute it, ->runtime_idle() will not be executed for the same device. - - * A request to execute or to schedule the execution of ->runtime_suspend() - will cancel any pending requests to execute ->runtime_idle() for the same - device. - - * If ->runtime_resume() is about to be executed or there's a pending request - to execute it, the other callbacks will not be executed for the same device. - - * A request to execute ->runtime_resume() will cancel any pending or - scheduled requests to execute the other callbacks for the same device, - except for scheduled autosuspends. - -3. Runtime PM Device Fields - -The following device runtime PM fields are present in 'struct dev_pm_info', as -defined in include/linux/pm.h: - - struct timer_list suspend_timer; - - timer used for scheduling (delayed) suspend and autosuspend requests - - unsigned long timer_expires; - - timer expiration time, in jiffies (if this is different from zero, the - timer is running and will expire at that time, otherwise the timer is not - running) - - struct work_struct work; - - work structure used for queuing up requests (i.e. work items in pm_wq) - - wait_queue_head_t wait_queue; - - wait queue used if any of the helper functions needs to wait for another - one to complete - - spinlock_t lock; - - lock used for synchronization - - atomic_t usage_count; - - the usage counter of the device - - atomic_t child_count; - - the count of 'active' children of the device - - unsigned int ignore_children; - - if set, the value of child_count is ignored (but still updated) - - unsigned int disable_depth; - - used for disabling the helper functions (they work normally if this is - equal to zero); the initial value of it is 1 (i.e. runtime PM is - initially disabled for all devices) - - int runtime_error; - - if set, there was a fatal error (one of the callbacks returned error code - as described in Section 2), so the helper functions will not work until - this flag is cleared; this is the error code returned by the failing - callback - - unsigned int idle_notification; - - if set, ->runtime_idle() is being executed - - unsigned int request_pending; - - if set, there's a pending request (i.e. a work item queued up into pm_wq) - - enum rpm_request request; - - type of request that's pending (valid if request_pending is set) - - unsigned int deferred_resume; - - set if ->runtime_resume() is about to be run while ->runtime_suspend() is - being executed for that device and it is not practical to wait for the - suspend to complete; means "start a resume as soon as you've suspended" - - enum rpm_status runtime_status; - - the runtime PM status of the device; this field's initial value is - RPM_SUSPENDED, which means that each device is initially regarded by the - PM core as 'suspended', regardless of its real hardware status - - unsigned int runtime_auto; - - if set, indicates that the user space has allowed the device driver to - power manage the device at run time via the /sys/devices/.../power/control - interface; it may only be modified with the help of the pm_runtime_allow() - and pm_runtime_forbid() helper functions - - unsigned int no_callbacks; - - indicates that the device does not use the runtime PM callbacks (see - Section 8); it may be modified only by the pm_runtime_no_callbacks() - helper function - - unsigned int irq_safe; - - indicates that the ->runtime_suspend() and ->runtime_resume() callbacks - will be invoked with the spinlock held and interrupts disabled - - unsigned int use_autosuspend; - - indicates that the device's driver supports delayed autosuspend (see - Section 9); it may be modified only by the - pm_runtime{_dont}_use_autosuspend() helper functions - - unsigned int timer_autosuspends; - - indicates that the PM core should attempt to carry out an autosuspend - when the timer expires rather than a normal suspend - - int autosuspend_delay; - - the delay time (in milliseconds) to be used for autosuspend - - unsigned long last_busy; - - the time (in jiffies) when the pm_runtime_mark_last_busy() helper - function was last called for this device; used in calculating inactivity - periods for autosuspend - -All of the above fields are members of the 'power' member of 'struct device'. - -4. Runtime PM Device Helper Functions - -The following runtime PM helper functions are defined in -drivers/base/power/runtime.c and include/linux/pm_runtime.h: - - void pm_runtime_init(struct device *dev); - - initialize the device runtime PM fields in 'struct dev_pm_info' - - void pm_runtime_remove(struct device *dev); - - make sure that the runtime PM of the device will be disabled after - removing the device from device hierarchy - - int pm_runtime_idle(struct device *dev); - - execute the subsystem-level idle callback for the device; returns an - error code on failure, where -EINPROGRESS means that ->runtime_idle() is - already being executed; if there is no callback or the callback returns 0 - then run pm_runtime_autosuspend(dev) and return its result - - int pm_runtime_suspend(struct device *dev); - - execute the subsystem-level suspend callback for the device; returns 0 on - success, 1 if the device's runtime PM status was already 'suspended', or - error code on failure, where -EAGAIN or -EBUSY means it is safe to attempt - to suspend the device again in future and -EACCES means that - 'power.disable_depth' is different from 0 - - int pm_runtime_autosuspend(struct device *dev); - - same as pm_runtime_suspend() except that the autosuspend delay is taken - into account; if pm_runtime_autosuspend_expiration() says the delay has - not yet expired then an autosuspend is scheduled for the appropriate time - and 0 is returned - - int pm_runtime_resume(struct device *dev); - - execute the subsystem-level resume callback for the device; returns 0 on - success, 1 if the device's runtime PM status was already 'active' or - error code on failure, where -EAGAIN means it may be safe to attempt to - resume the device again in future, but 'power.runtime_error' should be - checked additionally, and -EACCES means that 'power.disable_depth' is - different from 0 - - int pm_request_idle(struct device *dev); - - submit a request to execute the subsystem-level idle callback for the - device (the request is represented by a work item in pm_wq); returns 0 on - success or error code if the request has not been queued up - - int pm_request_autosuspend(struct device *dev); - - schedule the execution of the subsystem-level suspend callback for the - device when the autosuspend delay has expired; if the delay has already - expired then the work item is queued up immediately - - int pm_schedule_suspend(struct device *dev, unsigned int delay); - - schedule the execution of the subsystem-level suspend callback for the - device in future, where 'delay' is the time to wait before queuing up a - suspend work item in pm_wq, in milliseconds (if 'delay' is zero, the work - item is queued up immediately); returns 0 on success, 1 if the device's PM - runtime status was already 'suspended', or error code if the request - hasn't been scheduled (or queued up if 'delay' is 0); if the execution of - ->runtime_suspend() is already scheduled and not yet expired, the new - value of 'delay' will be used as the time to wait - - int pm_request_resume(struct device *dev); - - submit a request to execute the subsystem-level resume callback for the - device (the request is represented by a work item in pm_wq); returns 0 on - success, 1 if the device's runtime PM status was already 'active', or - error code if the request hasn't been queued up - - void pm_runtime_get_noresume(struct device *dev); - - increment the device's usage counter - - int pm_runtime_get(struct device *dev); - - increment the device's usage counter, run pm_request_resume(dev) and - return its result - - int pm_runtime_get_sync(struct device *dev); - - increment the device's usage counter, run pm_runtime_resume(dev) and - return its result - - int pm_runtime_get_if_in_use(struct device *dev); - - return -EINVAL if 'power.disable_depth' is nonzero; otherwise, if the - runtime PM status is RPM_ACTIVE and the runtime PM usage counter is - nonzero, increment the counter and return 1; otherwise return 0 without - changing the counter - - void pm_runtime_put_noidle(struct device *dev); - - decrement the device's usage counter - - int pm_runtime_put(struct device *dev); - - decrement the device's usage counter; if the result is 0 then run - pm_request_idle(dev) and return its result - - int pm_runtime_put_autosuspend(struct device *dev); - - decrement the device's usage counter; if the result is 0 then run - pm_request_autosuspend(dev) and return its result - - int pm_runtime_put_sync(struct device *dev); - - decrement the device's usage counter; if the result is 0 then run - pm_runtime_idle(dev) and return its result - - int pm_runtime_put_sync_suspend(struct device *dev); - - decrement the device's usage counter; if the result is 0 then run - pm_runtime_suspend(dev) and return its result - - int pm_runtime_put_sync_autosuspend(struct device *dev); - - decrement the device's usage counter; if the result is 0 then run - pm_runtime_autosuspend(dev) and return its result - - void pm_runtime_enable(struct device *dev); - - decrement the device's 'power.disable_depth' field; if that field is equal - to zero, the runtime PM helper functions can execute subsystem-level - callbacks described in Section 2 for the device - - int pm_runtime_disable(struct device *dev); - - increment the device's 'power.disable_depth' field (if the value of that - field was previously zero, this prevents subsystem-level runtime PM - callbacks from being run for the device), make sure that all of the - pending runtime PM operations on the device are either completed or - canceled; returns 1 if there was a resume request pending and it was - necessary to execute the subsystem-level resume callback for the device - to satisfy that request, otherwise 0 is returned - - int pm_runtime_barrier(struct device *dev); - - check if there's a resume request pending for the device and resume it - (synchronously) in that case, cancel any other pending runtime PM requests - regarding it and wait for all runtime PM operations on it in progress to - complete; returns 1 if there was a resume request pending and it was - necessary to execute the subsystem-level resume callback for the device to - satisfy that request, otherwise 0 is returned - - void pm_suspend_ignore_children(struct device *dev, bool enable); - - set/unset the power.ignore_children flag of the device - - int pm_runtime_set_active(struct device *dev); - - clear the device's 'power.runtime_error' flag, set the device's runtime - PM status to 'active' and update its parent's counter of 'active' - children as appropriate (it is only valid to use this function if - 'power.runtime_error' is set or 'power.disable_depth' is greater than - zero); it will fail and return error code if the device has a parent - which is not active and the 'power.ignore_children' flag of which is unset - - void pm_runtime_set_suspended(struct device *dev); - - clear the device's 'power.runtime_error' flag, set the device's runtime - PM status to 'suspended' and update its parent's counter of 'active' - children as appropriate (it is only valid to use this function if - 'power.runtime_error' is set or 'power.disable_depth' is greater than - zero) - - bool pm_runtime_active(struct device *dev); - - return true if the device's runtime PM status is 'active' or its - 'power.disable_depth' field is not equal to zero, or false otherwise - - bool pm_runtime_suspended(struct device *dev); - - return true if the device's runtime PM status is 'suspended' and its - 'power.disable_depth' field is equal to zero, or false otherwise - - bool pm_runtime_status_suspended(struct device *dev); - - return true if the device's runtime PM status is 'suspended' - - void pm_runtime_allow(struct device *dev); - - set the power.runtime_auto flag for the device and decrease its usage - counter (used by the /sys/devices/.../power/control interface to - effectively allow the device to be power managed at run time) - - void pm_runtime_forbid(struct device *dev); - - unset the power.runtime_auto flag for the device and increase its usage - counter (used by the /sys/devices/.../power/control interface to - effectively prevent the device from being power managed at run time) - - void pm_runtime_no_callbacks(struct device *dev); - - set the power.no_callbacks flag for the device and remove the runtime - PM attributes from /sys/devices/.../power (or prevent them from being - added when the device is registered) - - void pm_runtime_irq_safe(struct device *dev); - - set the power.irq_safe flag for the device, causing the runtime-PM - callbacks to be invoked with interrupts off - - bool pm_runtime_is_irq_safe(struct device *dev); - - return true if power.irq_safe flag was set for the device, causing - the runtime-PM callbacks to be invoked with interrupts off - - void pm_runtime_mark_last_busy(struct device *dev); - - set the power.last_busy field to the current time - - void pm_runtime_use_autosuspend(struct device *dev); - - set the power.use_autosuspend flag, enabling autosuspend delays; call - pm_runtime_get_sync if the flag was previously cleared and - power.autosuspend_delay is negative - - void pm_runtime_dont_use_autosuspend(struct device *dev); - - clear the power.use_autosuspend flag, disabling autosuspend delays; - decrement the device's usage counter if the flag was previously set and - power.autosuspend_delay is negative; call pm_runtime_idle - - void pm_runtime_set_autosuspend_delay(struct device *dev, int delay); - - set the power.autosuspend_delay value to 'delay' (expressed in - milliseconds); if 'delay' is negative then runtime suspends are - prevented; if power.use_autosuspend is set, pm_runtime_get_sync may be - called or the device's usage counter may be decremented and - pm_runtime_idle called depending on if power.autosuspend_delay is - changed to or from a negative value; if power.use_autosuspend is clear, - pm_runtime_idle is called - - unsigned long pm_runtime_autosuspend_expiration(struct device *dev); - - calculate the time when the current autosuspend delay period will expire, - based on power.last_busy and power.autosuspend_delay; if the delay time - is 1000 ms or larger then the expiration time is rounded up to the - nearest second; returns 0 if the delay period has already expired or - power.use_autosuspend isn't set, otherwise returns the expiration time - in jiffies - -It is safe to execute the following helper functions from interrupt context: - -pm_request_idle() -pm_request_autosuspend() -pm_schedule_suspend() -pm_request_resume() -pm_runtime_get_noresume() -pm_runtime_get() -pm_runtime_put_noidle() -pm_runtime_put() -pm_runtime_put_autosuspend() -pm_runtime_enable() -pm_suspend_ignore_children() -pm_runtime_set_active() -pm_runtime_set_suspended() -pm_runtime_suspended() -pm_runtime_mark_last_busy() -pm_runtime_autosuspend_expiration() - -If pm_runtime_irq_safe() has been called for a device then the following helper -functions may also be used in interrupt context: - -pm_runtime_idle() -pm_runtime_suspend() -pm_runtime_autosuspend() -pm_runtime_resume() -pm_runtime_get_sync() -pm_runtime_put_sync() -pm_runtime_put_sync_suspend() -pm_runtime_put_sync_autosuspend() - -5. Runtime PM Initialization, Device Probing and Removal - -Initially, the runtime PM is disabled for all devices, which means that the -majority of the runtime PM helper functions described in Section 4 will return --EAGAIN until pm_runtime_enable() is called for the device. - -In addition to that, the initial runtime PM status of all devices is -'suspended', but it need not reflect the actual physical state of the device. -Thus, if the device is initially active (i.e. it is able to process I/O), its -runtime PM status must be changed to 'active', with the help of -pm_runtime_set_active(), before pm_runtime_enable() is called for the device. - -However, if the device has a parent and the parent's runtime PM is enabled, -calling pm_runtime_set_active() for the device will affect the parent, unless -the parent's 'power.ignore_children' flag is set. Namely, in that case the -parent won't be able to suspend at run time, using the PM core's helper -functions, as long as the child's status is 'active', even if the child's -runtime PM is still disabled (i.e. pm_runtime_enable() hasn't been called for -the child yet or pm_runtime_disable() has been called for it). For this reason, -once pm_runtime_set_active() has been called for the device, pm_runtime_enable() -should be called for it too as soon as reasonably possible or its runtime PM -status should be changed back to 'suspended' with the help of -pm_runtime_set_suspended(). - -If the default initial runtime PM status of the device (i.e. 'suspended') -reflects the actual state of the device, its bus type's or its driver's -->probe() callback will likely need to wake it up using one of the PM core's -helper functions described in Section 4. In that case, pm_runtime_resume() -should be used. Of course, for this purpose the device's runtime PM has to be -enabled earlier by calling pm_runtime_enable(). - -Note, if the device may execute pm_runtime calls during the probe (such as -if it is registers with a subsystem that may call back in) then the -pm_runtime_get_sync() call paired with a pm_runtime_put() call will be -appropriate to ensure that the device is not put back to sleep during the -probe. This can happen with systems such as the network device layer. - -It may be desirable to suspend the device once ->probe() has finished. -Therefore the driver core uses the asynchronous pm_request_idle() to submit a -request to execute the subsystem-level idle callback for the device at that -time. A driver that makes use of the runtime autosuspend feature, may want to -update the last busy mark before returning from ->probe(). - -Moreover, the driver core prevents runtime PM callbacks from racing with the bus -notifier callback in __device_release_driver(), which is necessary, because the -notifier is used by some subsystems to carry out operations affecting the -runtime PM functionality. It does so by calling pm_runtime_get_sync() before -driver_sysfs_remove() and the BUS_NOTIFY_UNBIND_DRIVER notifications. This -resumes the device if it's in the suspended state and prevents it from -being suspended again while those routines are being executed. - -To allow bus types and drivers to put devices into the suspended state by -calling pm_runtime_suspend() from their ->remove() routines, the driver core -executes pm_runtime_put_sync() after running the BUS_NOTIFY_UNBIND_DRIVER -notifications in __device_release_driver(). This requires bus types and -drivers to make their ->remove() callbacks avoid races with runtime PM directly, -but also it allows of more flexibility in the handling of devices during the -removal of their drivers. - -Drivers in ->remove() callback should undo the runtime PM changes done -in ->probe(). Usually this means calling pm_runtime_disable(), -pm_runtime_dont_use_autosuspend() etc. - -The user space can effectively disallow the driver of the device to power manage -it at run time by changing the value of its /sys/devices/.../power/control -attribute to "on", which causes pm_runtime_forbid() to be called. In principle, -this mechanism may also be used by the driver to effectively turn off the -runtime power management of the device until the user space turns it on. -Namely, during the initialization the driver can make sure that the runtime PM -status of the device is 'active' and call pm_runtime_forbid(). It should be -noted, however, that if the user space has already intentionally changed the -value of /sys/devices/.../power/control to "auto" to allow the driver to power -manage the device at run time, the driver may confuse it by using -pm_runtime_forbid() this way. - -6. Runtime PM and System Sleep - -Runtime PM and system sleep (i.e., system suspend and hibernation, also known -as suspend-to-RAM and suspend-to-disk) interact with each other in a couple of -ways. If a device is active when a system sleep starts, everything is -straightforward. But what should happen if the device is already suspended? - -The device may have different wake-up settings for runtime PM and system sleep. -For example, remote wake-up may be enabled for runtime suspend but disallowed -for system sleep (device_may_wakeup(dev) returns 'false'). When this happens, -the subsystem-level system suspend callback is responsible for changing the -device's wake-up setting (it may leave that to the device driver's system -suspend routine). It may be necessary to resume the device and suspend it again -in order to do so. The same is true if the driver uses different power levels -or other settings for runtime suspend and system sleep. - -During system resume, the simplest approach is to bring all devices back to full -power, even if they had been suspended before the system suspend began. There -are several reasons for this, including: - - * The device might need to switch power levels, wake-up settings, etc. - - * Remote wake-up events might have been lost by the firmware. - - * The device's children may need the device to be at full power in order - to resume themselves. - - * The driver's idea of the device state may not agree with the device's - physical state. This can happen during resume from hibernation. - - * The device might need to be reset. - - * Even though the device was suspended, if its usage counter was > 0 then most - likely it would need a runtime resume in the near future anyway. - -If the device had been suspended before the system suspend began and it's -brought back to full power during resume, then its runtime PM status will have -to be updated to reflect the actual post-system sleep status. The way to do -this is: - - pm_runtime_disable(dev); - pm_runtime_set_active(dev); - pm_runtime_enable(dev); - -The PM core always increments the runtime usage counter before calling the -->suspend() callback and decrements it after calling the ->resume() callback. -Hence disabling runtime PM temporarily like this will not cause any runtime -suspend attempts to be permanently lost. If the usage count goes to zero -following the return of the ->resume() callback, the ->runtime_idle() callback -will be invoked as usual. - -On some systems, however, system sleep is not entered through a global firmware -or hardware operation. Instead, all hardware components are put into low-power -states directly by the kernel in a coordinated way. Then, the system sleep -state effectively follows from the states the hardware components end up in -and the system is woken up from that state by a hardware interrupt or a similar -mechanism entirely under the kernel's control. As a result, the kernel never -gives control away and the states of all devices during resume are precisely -known to it. If that is the case and none of the situations listed above takes -place (in particular, if the system is not waking up from hibernation), it may -be more efficient to leave the devices that had been suspended before the system -suspend began in the suspended state. - -To this end, the PM core provides a mechanism allowing some coordination between -different levels of device hierarchy. Namely, if a system suspend .prepare() -callback returns a positive number for a device, that indicates to the PM core -that the device appears to be runtime-suspended and its state is fine, so it -may be left in runtime suspend provided that all of its descendants are also -left in runtime suspend. If that happens, the PM core will not execute any -system suspend and resume callbacks for all of those devices, except for the -complete callback, which is then entirely responsible for handling the device -as appropriate. This only applies to system suspend transitions that are not -related to hibernation (see Documentation/driver-api/pm/devices.rst for more -information). - -The PM core does its best to reduce the probability of race conditions between -the runtime PM and system suspend/resume (and hibernation) callbacks by carrying -out the following operations: - - * During system suspend pm_runtime_get_noresume() is called for every device - right before executing the subsystem-level .prepare() callback for it and - pm_runtime_barrier() is called for every device right before executing the - subsystem-level .suspend() callback for it. In addition to that the PM core - calls __pm_runtime_disable() with 'false' as the second argument for every - device right before executing the subsystem-level .suspend_late() callback - for it. - - * During system resume pm_runtime_enable() and pm_runtime_put() are called for - every device right after executing the subsystem-level .resume_early() - callback and right after executing the subsystem-level .complete() callback - for it, respectively. - -7. Generic subsystem callbacks - -Subsystems may wish to conserve code space by using the set of generic power -management callbacks provided by the PM core, defined in -driver/base/power/generic_ops.c: - - int pm_generic_runtime_suspend(struct device *dev); - - invoke the ->runtime_suspend() callback provided by the driver of this - device and return its result, or return 0 if not defined - - int pm_generic_runtime_resume(struct device *dev); - - invoke the ->runtime_resume() callback provided by the driver of this - device and return its result, or return 0 if not defined - - int pm_generic_suspend(struct device *dev); - - if the device has not been suspended at run time, invoke the ->suspend() - callback provided by its driver and return its result, or return 0 if not - defined - - int pm_generic_suspend_noirq(struct device *dev); - - if pm_runtime_suspended(dev) returns "false", invoke the ->suspend_noirq() - callback provided by the device's driver and return its result, or return - 0 if not defined - - int pm_generic_resume(struct device *dev); - - invoke the ->resume() callback provided by the driver of this device and, - if successful, change the device's runtime PM status to 'active' - - int pm_generic_resume_noirq(struct device *dev); - - invoke the ->resume_noirq() callback provided by the driver of this device - - int pm_generic_freeze(struct device *dev); - - if the device has not been suspended at run time, invoke the ->freeze() - callback provided by its driver and return its result, or return 0 if not - defined - - int pm_generic_freeze_noirq(struct device *dev); - - if pm_runtime_suspended(dev) returns "false", invoke the ->freeze_noirq() - callback provided by the device's driver and return its result, or return - 0 if not defined - - int pm_generic_thaw(struct device *dev); - - if the device has not been suspended at run time, invoke the ->thaw() - callback provided by its driver and return its result, or return 0 if not - defined - - int pm_generic_thaw_noirq(struct device *dev); - - if pm_runtime_suspended(dev) returns "false", invoke the ->thaw_noirq() - callback provided by the device's driver and return its result, or return - 0 if not defined - - int pm_generic_poweroff(struct device *dev); - - if the device has not been suspended at run time, invoke the ->poweroff() - callback provided by its driver and return its result, or return 0 if not - defined - - int pm_generic_poweroff_noirq(struct device *dev); - - if pm_runtime_suspended(dev) returns "false", run the ->poweroff_noirq() - callback provided by the device's driver and return its result, or return - 0 if not defined - - int pm_generic_restore(struct device *dev); - - invoke the ->restore() callback provided by the driver of this device and, - if successful, change the device's runtime PM status to 'active' - - int pm_generic_restore_noirq(struct device *dev); - - invoke the ->restore_noirq() callback provided by the device's driver - -These functions are the defaults used by the PM core, if a subsystem doesn't -provide its own callbacks for ->runtime_idle(), ->runtime_suspend(), -->runtime_resume(), ->suspend(), ->suspend_noirq(), ->resume(), -->resume_noirq(), ->freeze(), ->freeze_noirq(), ->thaw(), ->thaw_noirq(), -->poweroff(), ->poweroff_noirq(), ->restore(), ->restore_noirq() in the -subsystem-level dev_pm_ops structure. - -Device drivers that wish to use the same function as a system suspend, freeze, -poweroff and runtime suspend callback, and similarly for system resume, thaw, -restore, and runtime resume, can achieve this with the help of the -UNIVERSAL_DEV_PM_OPS macro defined in include/linux/pm.h (possibly setting its -last argument to NULL). - -8. "No-Callback" Devices - -Some "devices" are only logical sub-devices of their parent and cannot be -power-managed on their own. (The prototype example is a USB interface. Entire -USB devices can go into low-power mode or send wake-up requests, but neither is -possible for individual interfaces.) The drivers for these devices have no -need of runtime PM callbacks; if the callbacks did exist, ->runtime_suspend() -and ->runtime_resume() would always return 0 without doing anything else and -->runtime_idle() would always call pm_runtime_suspend(). - -Subsystems can tell the PM core about these devices by calling -pm_runtime_no_callbacks(). This should be done after the device structure is -initialized and before it is registered (although after device registration is -also okay). The routine will set the device's power.no_callbacks flag and -prevent the non-debugging runtime PM sysfs attributes from being created. - -When power.no_callbacks is set, the PM core will not invoke the -->runtime_idle(), ->runtime_suspend(), or ->runtime_resume() callbacks. -Instead it will assume that suspends and resumes always succeed and that idle -devices should be suspended. - -As a consequence, the PM core will never directly inform the device's subsystem -or driver about runtime power changes. Instead, the driver for the device's -parent must take responsibility for telling the device's driver when the -parent's power state changes. - -9. Autosuspend, or automatically-delayed suspends - -Changing a device's power state isn't free; it requires both time and energy. -A device should be put in a low-power state only when there's some reason to -think it will remain in that state for a substantial time. A common heuristic -says that a device which hasn't been used for a while is liable to remain -unused; following this advice, drivers should not allow devices to be suspended -at runtime until they have been inactive for some minimum period. Even when -the heuristic ends up being non-optimal, it will still prevent devices from -"bouncing" too rapidly between low-power and full-power states. - -The term "autosuspend" is an historical remnant. It doesn't mean that the -device is automatically suspended (the subsystem or driver still has to call -the appropriate PM routines); rather it means that runtime suspends will -automatically be delayed until the desired period of inactivity has elapsed. - -Inactivity is determined based on the power.last_busy field. Drivers should -call pm_runtime_mark_last_busy() to update this field after carrying out I/O, -typically just before calling pm_runtime_put_autosuspend(). The desired length -of the inactivity period is a matter of policy. Subsystems can set this length -initially by calling pm_runtime_set_autosuspend_delay(), but after device -registration the length should be controlled by user space, using the -/sys/devices/.../power/autosuspend_delay_ms attribute. - -In order to use autosuspend, subsystems or drivers must call -pm_runtime_use_autosuspend() (preferably before registering the device), and -thereafter they should use the various *_autosuspend() helper functions instead -of the non-autosuspend counterparts: - - Instead of: pm_runtime_suspend use: pm_runtime_autosuspend; - Instead of: pm_schedule_suspend use: pm_request_autosuspend; - Instead of: pm_runtime_put use: pm_runtime_put_autosuspend; - Instead of: pm_runtime_put_sync use: pm_runtime_put_sync_autosuspend. - -Drivers may also continue to use the non-autosuspend helper functions; they -will behave normally, which means sometimes taking the autosuspend delay into -account (see pm_runtime_idle). - -Under some circumstances a driver or subsystem may want to prevent a device -from autosuspending immediately, even though the usage counter is zero and the -autosuspend delay time has expired. If the ->runtime_suspend() callback -returns -EAGAIN or -EBUSY, and if the next autosuspend delay expiration time is -in the future (as it normally would be if the callback invoked -pm_runtime_mark_last_busy()), the PM core will automatically reschedule the -autosuspend. The ->runtime_suspend() callback can't do this rescheduling -itself because no suspend requests of any kind are accepted while the device is -suspending (i.e., while the callback is running). - -The implementation is well suited for asynchronous use in interrupt contexts. -However such use inevitably involves races, because the PM core can't -synchronize ->runtime_suspend() callbacks with the arrival of I/O requests. -This synchronization must be handled by the driver, using its private lock. -Here is a schematic pseudo-code example: - - foo_read_or_write(struct foo_priv *foo, void *data) - { - lock(&foo->private_lock); - add_request_to_io_queue(foo, data); - if (foo->num_pending_requests++ == 0) - pm_runtime_get(&foo->dev); - if (!foo->is_suspended) - foo_process_next_request(foo); - unlock(&foo->private_lock); - } - - foo_io_completion(struct foo_priv *foo, void *req) - { - lock(&foo->private_lock); - if (--foo->num_pending_requests == 0) { - pm_runtime_mark_last_busy(&foo->dev); - pm_runtime_put_autosuspend(&foo->dev); - } else { - foo_process_next_request(foo); - } - unlock(&foo->private_lock); - /* Send req result back to the user ... */ - } - - int foo_runtime_suspend(struct device *dev) - { - struct foo_priv foo = container_of(dev, ...); - int ret = 0; - - lock(&foo->private_lock); - if (foo->num_pending_requests > 0) { - ret = -EBUSY; - } else { - /* ... suspend the device ... */ - foo->is_suspended = 1; - } - unlock(&foo->private_lock); - return ret; - } - - int foo_runtime_resume(struct device *dev) - { - struct foo_priv foo = container_of(dev, ...); - - lock(&foo->private_lock); - /* ... resume the device ... */ - foo->is_suspended = 0; - pm_runtime_mark_last_busy(&foo->dev); - if (foo->num_pending_requests > 0) - foo_process_next_request(foo); - unlock(&foo->private_lock); - return 0; - } - -The important point is that after foo_io_completion() asks for an autosuspend, -the foo_runtime_suspend() callback may race with foo_read_or_write(). -Therefore foo_runtime_suspend() has to check whether there are any pending I/O -requests (while holding the private lock) before allowing the suspend to -proceed. - -In addition, the power.autosuspend_delay field can be changed by user space at -any time. If a driver cares about this, it can call -pm_runtime_autosuspend_expiration() from within the ->runtime_suspend() -callback while holding its private lock. If the function returns a nonzero -value then the delay has not yet expired and the callback should return --EAGAIN. diff --git a/Documentation/power/s2ram.rst b/Documentation/power/s2ram.rst new file mode 100644 index 000000000000..d739aa7c742c --- /dev/null +++ b/Documentation/power/s2ram.rst @@ -0,0 +1,87 @@ +======================== +How to get s2ram working +======================== + +2006 Linus Torvalds +2006 Pavel Machek + +1) Check suspend.sf.net, program s2ram there has long whitelist of + "known ok" machines, along with tricks to use on each one. + +2) If that does not help, try reading tricks.txt and + video.txt. Perhaps problem is as simple as broken module, and + simple module unload can fix it. + +3) You can use Linus' TRACE_RESUME infrastructure, described below. + +Using TRACE_RESUME +~~~~~~~~~~~~~~~~~~ + +I've been working at making the machines I have able to STR, and almost +always it's a driver that is buggy. Thank God for the suspend/resume +debugging - the thing that Chuck tried to disable. That's often the _only_ +way to debug these things, and it's actually pretty powerful (but +time-consuming - having to insert TRACE_RESUME() markers into the device +driver that doesn't resume and recompile and reboot). + +Anyway, the way to debug this for people who are interested (have a +machine that doesn't boot) is: + + - enable PM_DEBUG, and PM_TRACE + + - use a script like this:: + + #!/bin/sh + sync + echo 1 > /sys/power/pm_trace + echo mem > /sys/power/state + + to suspend + + - if it doesn't come back up (which is usually the problem), reboot by + holding the power button down, and look at the dmesg output for things + like:: + + Magic number: 4:156:725 + hash matches drivers/base/power/resume.c:28 + hash matches device 0000:01:00.0 + + which means that the last trace event was just before trying to resume + device 0000:01:00.0. Then figure out what driver is controlling that + device (lspci and /sys/devices/pci* is your friend), and see if you can + fix it, disable it, or trace into its resume function. + + If no device matches the hash (or any matches appear to be false positives), + the culprit may be a device from a loadable kernel module that is not loaded + until after the hash is checked. You can check the hash against the current + devices again after more modules are loaded using sysfs:: + + cat /sys/power/pm_trace_dev_match + +For example, the above happens to be the VGA device on my EVO, which I +used to run with "radeonfb" (it's an ATI Radeon mobility). It turns out +that "radeonfb" simply cannot resume that device - it tries to set the +PLL's, and it just _hangs_. Using the regular VGA console and letting X +resume it instead works fine. + +NOTE +==== +pm_trace uses the system's Real Time Clock (RTC) to save the magic number. +Reason for this is that the RTC is the only reliably available piece of +hardware during resume operations where a value can be set that will +survive a reboot. + +pm_trace is not compatible with asynchronous suspend, so it turns +asynchronous suspend off (which may work around timing or +ordering-sensitive bugs). + +Consequence is that after a resume (even if it is successful) your system +clock will have a value corresponding to the magic number instead of the +correct date/time! It is therefore advisable to use a program like ntp-date +or rdate to reset the correct date/time from an external time source when +using this trace option. + +As the clock keeps ticking it is also essential that the reboot is done +quickly after the resume failure. The trace option does not use the seconds +or the low order bits of the minutes of the RTC, but a too long delay will +corrupt the magic value. diff --git a/Documentation/power/s2ram.txt b/Documentation/power/s2ram.txt deleted file mode 100644 index 4685aee197fd..000000000000 --- a/Documentation/power/s2ram.txt +++ /dev/null @@ -1,85 +0,0 @@ - How to get s2ram working - ~~~~~~~~~~~~~~~~~~~~~~~~ - 2006 Linus Torvalds - 2006 Pavel Machek - -1) Check suspend.sf.net, program s2ram there has long whitelist of - "known ok" machines, along with tricks to use on each one. - -2) If that does not help, try reading tricks.txt and - video.txt. Perhaps problem is as simple as broken module, and - simple module unload can fix it. - -3) You can use Linus' TRACE_RESUME infrastructure, described below. - - Using TRACE_RESUME - ~~~~~~~~~~~~~~~~~~ - -I've been working at making the machines I have able to STR, and almost -always it's a driver that is buggy. Thank God for the suspend/resume -debugging - the thing that Chuck tried to disable. That's often the _only_ -way to debug these things, and it's actually pretty powerful (but -time-consuming - having to insert TRACE_RESUME() markers into the device -driver that doesn't resume and recompile and reboot). - -Anyway, the way to debug this for people who are interested (have a -machine that doesn't boot) is: - - - enable PM_DEBUG, and PM_TRACE - - - use a script like this: - - #!/bin/sh - sync - echo 1 > /sys/power/pm_trace - echo mem > /sys/power/state - - to suspend - - - if it doesn't come back up (which is usually the problem), reboot by - holding the power button down, and look at the dmesg output for things - like - - Magic number: 4:156:725 - hash matches drivers/base/power/resume.c:28 - hash matches device 0000:01:00.0 - - which means that the last trace event was just before trying to resume - device 0000:01:00.0. Then figure out what driver is controlling that - device (lspci and /sys/devices/pci* is your friend), and see if you can - fix it, disable it, or trace into its resume function. - - If no device matches the hash (or any matches appear to be false positives), - the culprit may be a device from a loadable kernel module that is not loaded - until after the hash is checked. You can check the hash against the current - devices again after more modules are loaded using sysfs: - - cat /sys/power/pm_trace_dev_match - -For example, the above happens to be the VGA device on my EVO, which I -used to run with "radeonfb" (it's an ATI Radeon mobility). It turns out -that "radeonfb" simply cannot resume that device - it tries to set the -PLL's, and it just _hangs_. Using the regular VGA console and letting X -resume it instead works fine. - -NOTE -==== -pm_trace uses the system's Real Time Clock (RTC) to save the magic number. -Reason for this is that the RTC is the only reliably available piece of -hardware during resume operations where a value can be set that will -survive a reboot. - -pm_trace is not compatible with asynchronous suspend, so it turns -asynchronous suspend off (which may work around timing or -ordering-sensitive bugs). - -Consequence is that after a resume (even if it is successful) your system -clock will have a value corresponding to the magic number instead of the -correct date/time! It is therefore advisable to use a program like ntp-date -or rdate to reset the correct date/time from an external time source when -using this trace option. - -As the clock keeps ticking it is also essential that the reboot is done -quickly after the resume failure. The trace option does not use the seconds -or the low order bits of the minutes of the RTC, but a too long delay will -corrupt the magic value. diff --git a/Documentation/power/suspend-and-cpuhotplug.rst b/Documentation/power/suspend-and-cpuhotplug.rst new file mode 100644 index 000000000000..7ac8e1f549f4 --- /dev/null +++ b/Documentation/power/suspend-and-cpuhotplug.rst @@ -0,0 +1,286 @@ +==================================================================== +Interaction of Suspend code (S3) with the CPU hotplug infrastructure +==================================================================== + +(C) 2011 - 2014 Srivatsa S. Bhat + + +I. Differences between CPU hotplug and Suspend-to-RAM +====================================================== + +How does the regular CPU hotplug code differ from how the Suspend-to-RAM +infrastructure uses it internally? And where do they share common code? + +Well, a picture is worth a thousand words... So ASCII art follows :-) + +[This depicts the current design in the kernel, and focusses only on the +interactions involving the freezer and CPU hotplug and also tries to explain +the locking involved. It outlines the notifications involved as well. +But please note that here, only the call paths are illustrated, with the aim +of describing where they take different paths and where they share code. +What happens when regular CPU hotplug and Suspend-to-RAM race with each other +is not depicted here.] + +On a high level, the suspend-resume cycle goes like this:: + + |Freeze| -> |Disable nonboot| -> |Do suspend| -> |Enable nonboot| -> |Thaw | + |tasks | | cpus | | | | cpus | |tasks| + + +More details follow:: + + Suspend call path + ----------------- + + Write 'mem' to + /sys/power/state + sysfs file + | + v + Acquire system_transition_mutex lock + | + v + Send PM_SUSPEND_PREPARE + notifications + | + v + Freeze tasks + | + | + v + disable_nonboot_cpus() + /* start */ + | + v + Acquire cpu_add_remove_lock + | + v + Iterate over CURRENTLY + online CPUs + | + | + | ---------- + v | L + ======> _cpu_down() | + | [This takes cpuhotplug.lock | + Common | before taking down the CPU | + code | and releases it when done] | O + | While it is at it, notifications | + | are sent when notable events occur, | + ======> by running all registered callbacks. | + | | O + | | + | | + v | + Note down these cpus in | P + frozen_cpus mask ---------- + | + v + Disable regular cpu hotplug + by increasing cpu_hotplug_disabled + | + v + Release cpu_add_remove_lock + | + v + /* disable_nonboot_cpus() complete */ + | + v + Do suspend + + + +Resuming back is likewise, with the counterparts being (in the order of +execution during resume): + +* enable_nonboot_cpus() which involves:: + + | Acquire cpu_add_remove_lock + | Decrease cpu_hotplug_disabled, thereby enabling regular cpu hotplug + | Call _cpu_up() [for all those cpus in the frozen_cpus mask, in a loop] + | Release cpu_add_remove_lock + v + +* thaw tasks +* send PM_POST_SUSPEND notifications +* Release system_transition_mutex lock. + + +It is to be noted here that the system_transition_mutex lock is acquired at the very +beginning, when we are just starting out to suspend, and then released only +after the entire cycle is complete (i.e., suspend + resume). + +:: + + + + Regular CPU hotplug call path + ----------------------------- + + Write 0 (or 1) to + /sys/devices/system/cpu/cpu*/online + sysfs file + | + | + v + cpu_down() + | + v + Acquire cpu_add_remove_lock + | + v + If cpu_hotplug_disabled > 0 + return gracefully + | + | + v + ======> _cpu_down() + | [This takes cpuhotplug.lock + Common | before taking down the CPU + code | and releases it when done] + | While it is at it, notifications + | are sent when notable events occur, + ======> by running all registered callbacks. + | + | + v + Release cpu_add_remove_lock + [That's it!, for + regular CPU hotplug] + + + +So, as can be seen from the two diagrams (the parts marked as "Common code"), +regular CPU hotplug and the suspend code path converge at the _cpu_down() and +_cpu_up() functions. They differ in the arguments passed to these functions, +in that during regular CPU hotplug, 0 is passed for the 'tasks_frozen' +argument. But during suspend, since the tasks are already frozen by the time +the non-boot CPUs are offlined or onlined, the _cpu_*() functions are called +with the 'tasks_frozen' argument set to 1. +[See below for some known issues regarding this.] + + +Important files and functions/entry points: +------------------------------------------- + +- kernel/power/process.c : freeze_processes(), thaw_processes() +- kernel/power/suspend.c : suspend_prepare(), suspend_enter(), suspend_finish() +- kernel/cpu.c: cpu_[up|down](), _cpu_[up|down](), [disable|enable]_nonboot_cpus() + + + +II. What are the issues involved in CPU hotplug? +------------------------------------------------ + +There are some interesting situations involving CPU hotplug and microcode +update on the CPUs, as discussed below: + +[Please bear in mind that the kernel requests the microcode images from +userspace, using the request_firmware() function defined in +drivers/base/firmware_loader/main.c] + + +a. When all the CPUs are identical: + + This is the most common situation and it is quite straightforward: we want + to apply the same microcode revision to each of the CPUs. + To give an example of x86, the collect_cpu_info() function defined in + arch/x86/kernel/microcode_core.c helps in discovering the type of the CPU + and thereby in applying the correct microcode revision to it. + But note that the kernel does not maintain a common microcode image for the + all CPUs, in order to handle case 'b' described below. + + +b. When some of the CPUs are different than the rest: + + In this case since we probably need to apply different microcode revisions + to different CPUs, the kernel maintains a copy of the correct microcode + image for each CPU (after appropriate CPU type/model discovery using + functions such as collect_cpu_info()). + + +c. When a CPU is physically hot-unplugged and a new (and possibly different + type of) CPU is hot-plugged into the system: + + In the current design of the kernel, whenever a CPU is taken offline during + a regular CPU hotplug operation, upon receiving the CPU_DEAD notification + (which is sent by the CPU hotplug code), the microcode update driver's + callback for that event reacts by freeing the kernel's copy of the + microcode image for that CPU. + + Hence, when a new CPU is brought online, since the kernel finds that it + doesn't have the microcode image, it does the CPU type/model discovery + afresh and then requests the userspace for the appropriate microcode image + for that CPU, which is subsequently applied. + + For example, in x86, the mc_cpu_callback() function (which is the microcode + update driver's callback registered for CPU hotplug events) calls + microcode_update_cpu() which would call microcode_init_cpu() in this case, + instead of microcode_resume_cpu() when it finds that the kernel doesn't + have a valid microcode image. This ensures that the CPU type/model + discovery is performed and the right microcode is applied to the CPU after + getting it from userspace. + + +d. Handling microcode update during suspend/hibernate: + + Strictly speaking, during a CPU hotplug operation which does not involve + physically removing or inserting CPUs, the CPUs are not actually powered + off during a CPU offline. They are just put to the lowest C-states possible. + Hence, in such a case, it is not really necessary to re-apply microcode + when the CPUs are brought back online, since they wouldn't have lost the + image during the CPU offline operation. + + This is the usual scenario encountered during a resume after a suspend. + However, in the case of hibernation, since all the CPUs are completely + powered off, during restore it becomes necessary to apply the microcode + images to all the CPUs. + + [Note that we don't expect someone to physically pull out nodes and insert + nodes with a different type of CPUs in-between a suspend-resume or a + hibernate/restore cycle.] + + In the current design of the kernel however, during a CPU offline operation + as part of the suspend/hibernate cycle (cpuhp_tasks_frozen is set), + the existing copy of microcode image in the kernel is not freed up. + And during the CPU online operations (during resume/restore), since the + kernel finds that it already has copies of the microcode images for all the + CPUs, it just applies them to the CPUs, avoiding any re-discovery of CPU + type/model and the need for validating whether the microcode revisions are + right for the CPUs or not (due to the above assumption that physical CPU + hotplug will not be done in-between suspend/resume or hibernate/restore + cycles). + + +III. Known problems +=================== + +Are there any known problems when regular CPU hotplug and suspend race +with each other? + +Yes, they are listed below: + +1. When invoking regular CPU hotplug, the 'tasks_frozen' argument passed to + the _cpu_down() and _cpu_up() functions is *always* 0. + This might not reflect the true current state of the system, since the + tasks could have been frozen by an out-of-band event such as a suspend + operation in progress. Hence, the cpuhp_tasks_frozen variable will not + reflect the frozen state and the CPU hotplug callbacks which evaluate + that variable might execute the wrong code path. + +2. If a regular CPU hotplug stress test happens to race with the freezer due + to a suspend operation in progress at the same time, then we could hit the + situation described below: + + * A regular cpu online operation continues its journey from userspace + into the kernel, since the freezing has not yet begun. + * Then freezer gets to work and freezes userspace. + * If cpu online has not yet completed the microcode update stuff by now, + it will now start waiting on the frozen userspace in the + TASK_UNINTERRUPTIBLE state, in order to get the microcode image. + * Now the freezer continues and tries to freeze the remaining tasks. But + due to this wait mentioned above, the freezer won't be able to freeze + the cpu online hotplug task and hence freezing of tasks fails. + + As a result of this task freezing failure, the suspend operation gets + aborted. diff --git a/Documentation/power/suspend-and-cpuhotplug.txt b/Documentation/power/suspend-and-cpuhotplug.txt deleted file mode 100644 index a8751b8df10e..000000000000 --- a/Documentation/power/suspend-and-cpuhotplug.txt +++ /dev/null @@ -1,274 +0,0 @@ -Interaction of Suspend code (S3) with the CPU hotplug infrastructure - - (C) 2011 - 2014 Srivatsa S. Bhat - - -I. How does the regular CPU hotplug code differ from how the Suspend-to-RAM - infrastructure uses it internally? And where do they share common code? - -Well, a picture is worth a thousand words... So ASCII art follows :-) - -[This depicts the current design in the kernel, and focusses only on the -interactions involving the freezer and CPU hotplug and also tries to explain -the locking involved. It outlines the notifications involved as well. -But please note that here, only the call paths are illustrated, with the aim -of describing where they take different paths and where they share code. -What happens when regular CPU hotplug and Suspend-to-RAM race with each other -is not depicted here.] - -On a high level, the suspend-resume cycle goes like this: - -|Freeze| -> |Disable nonboot| -> |Do suspend| -> |Enable nonboot| -> |Thaw | -|tasks | | cpus | | | | cpus | |tasks| - - -More details follow: - - Suspend call path - ----------------- - - Write 'mem' to - /sys/power/state - sysfs file - | - v - Acquire system_transition_mutex lock - | - v - Send PM_SUSPEND_PREPARE - notifications - | - v - Freeze tasks - | - | - v - disable_nonboot_cpus() - /* start */ - | - v - Acquire cpu_add_remove_lock - | - v - Iterate over CURRENTLY - online CPUs - | - | - | ---------- - v | L - ======> _cpu_down() | - | [This takes cpuhotplug.lock | - Common | before taking down the CPU | - code | and releases it when done] | O - | While it is at it, notifications | - | are sent when notable events occur, | - ======> by running all registered callbacks. | - | | O - | | - | | - v | - Note down these cpus in | P - frozen_cpus mask ---------- - | - v - Disable regular cpu hotplug - by increasing cpu_hotplug_disabled - | - v - Release cpu_add_remove_lock - | - v - /* disable_nonboot_cpus() complete */ - | - v - Do suspend - - - -Resuming back is likewise, with the counterparts being (in the order of -execution during resume): -* enable_nonboot_cpus() which involves: - | Acquire cpu_add_remove_lock - | Decrease cpu_hotplug_disabled, thereby enabling regular cpu hotplug - | Call _cpu_up() [for all those cpus in the frozen_cpus mask, in a loop] - | Release cpu_add_remove_lock - v - -* thaw tasks -* send PM_POST_SUSPEND notifications -* Release system_transition_mutex lock. - - -It is to be noted here that the system_transition_mutex lock is acquired at the very -beginning, when we are just starting out to suspend, and then released only -after the entire cycle is complete (i.e., suspend + resume). - - - - Regular CPU hotplug call path - ----------------------------- - - Write 0 (or 1) to - /sys/devices/system/cpu/cpu*/online - sysfs file - | - | - v - cpu_down() - | - v - Acquire cpu_add_remove_lock - | - v - If cpu_hotplug_disabled > 0 - return gracefully - | - | - v - ======> _cpu_down() - | [This takes cpuhotplug.lock - Common | before taking down the CPU - code | and releases it when done] - | While it is at it, notifications - | are sent when notable events occur, - ======> by running all registered callbacks. - | - | - v - Release cpu_add_remove_lock - [That's it!, for - regular CPU hotplug] - - - -So, as can be seen from the two diagrams (the parts marked as "Common code"), -regular CPU hotplug and the suspend code path converge at the _cpu_down() and -_cpu_up() functions. They differ in the arguments passed to these functions, -in that during regular CPU hotplug, 0 is passed for the 'tasks_frozen' -argument. But during suspend, since the tasks are already frozen by the time -the non-boot CPUs are offlined or onlined, the _cpu_*() functions are called -with the 'tasks_frozen' argument set to 1. -[See below for some known issues regarding this.] - - -Important files and functions/entry points: ------------------------------------------- - -kernel/power/process.c : freeze_processes(), thaw_processes() -kernel/power/suspend.c : suspend_prepare(), suspend_enter(), suspend_finish() -kernel/cpu.c: cpu_[up|down](), _cpu_[up|down](), [disable|enable]_nonboot_cpus() - - - -II. What are the issues involved in CPU hotplug? - ------------------------------------------- - -There are some interesting situations involving CPU hotplug and microcode -update on the CPUs, as discussed below: - -[Please bear in mind that the kernel requests the microcode images from -userspace, using the request_firmware() function defined in -drivers/base/firmware_loader/main.c] - - -a. When all the CPUs are identical: - - This is the most common situation and it is quite straightforward: we want - to apply the same microcode revision to each of the CPUs. - To give an example of x86, the collect_cpu_info() function defined in - arch/x86/kernel/microcode_core.c helps in discovering the type of the CPU - and thereby in applying the correct microcode revision to it. - But note that the kernel does not maintain a common microcode image for the - all CPUs, in order to handle case 'b' described below. - - -b. When some of the CPUs are different than the rest: - - In this case since we probably need to apply different microcode revisions - to different CPUs, the kernel maintains a copy of the correct microcode - image for each CPU (after appropriate CPU type/model discovery using - functions such as collect_cpu_info()). - - -c. When a CPU is physically hot-unplugged and a new (and possibly different - type of) CPU is hot-plugged into the system: - - In the current design of the kernel, whenever a CPU is taken offline during - a regular CPU hotplug operation, upon receiving the CPU_DEAD notification - (which is sent by the CPU hotplug code), the microcode update driver's - callback for that event reacts by freeing the kernel's copy of the - microcode image for that CPU. - - Hence, when a new CPU is brought online, since the kernel finds that it - doesn't have the microcode image, it does the CPU type/model discovery - afresh and then requests the userspace for the appropriate microcode image - for that CPU, which is subsequently applied. - - For example, in x86, the mc_cpu_callback() function (which is the microcode - update driver's callback registered for CPU hotplug events) calls - microcode_update_cpu() which would call microcode_init_cpu() in this case, - instead of microcode_resume_cpu() when it finds that the kernel doesn't - have a valid microcode image. This ensures that the CPU type/model - discovery is performed and the right microcode is applied to the CPU after - getting it from userspace. - - -d. Handling microcode update during suspend/hibernate: - - Strictly speaking, during a CPU hotplug operation which does not involve - physically removing or inserting CPUs, the CPUs are not actually powered - off during a CPU offline. They are just put to the lowest C-states possible. - Hence, in such a case, it is not really necessary to re-apply microcode - when the CPUs are brought back online, since they wouldn't have lost the - image during the CPU offline operation. - - This is the usual scenario encountered during a resume after a suspend. - However, in the case of hibernation, since all the CPUs are completely - powered off, during restore it becomes necessary to apply the microcode - images to all the CPUs. - - [Note that we don't expect someone to physically pull out nodes and insert - nodes with a different type of CPUs in-between a suspend-resume or a - hibernate/restore cycle.] - - In the current design of the kernel however, during a CPU offline operation - as part of the suspend/hibernate cycle (cpuhp_tasks_frozen is set), - the existing copy of microcode image in the kernel is not freed up. - And during the CPU online operations (during resume/restore), since the - kernel finds that it already has copies of the microcode images for all the - CPUs, it just applies them to the CPUs, avoiding any re-discovery of CPU - type/model and the need for validating whether the microcode revisions are - right for the CPUs or not (due to the above assumption that physical CPU - hotplug will not be done in-between suspend/resume or hibernate/restore - cycles). - - -III. Are there any known problems when regular CPU hotplug and suspend race - with each other? - -Yes, they are listed below: - -1. When invoking regular CPU hotplug, the 'tasks_frozen' argument passed to - the _cpu_down() and _cpu_up() functions is *always* 0. - This might not reflect the true current state of the system, since the - tasks could have been frozen by an out-of-band event such as a suspend - operation in progress. Hence, the cpuhp_tasks_frozen variable will not - reflect the frozen state and the CPU hotplug callbacks which evaluate - that variable might execute the wrong code path. - -2. If a regular CPU hotplug stress test happens to race with the freezer due - to a suspend operation in progress at the same time, then we could hit the - situation described below: - - * A regular cpu online operation continues its journey from userspace - into the kernel, since the freezing has not yet begun. - * Then freezer gets to work and freezes userspace. - * If cpu online has not yet completed the microcode update stuff by now, - it will now start waiting on the frozen userspace in the - TASK_UNINTERRUPTIBLE state, in order to get the microcode image. - * Now the freezer continues and tries to freeze the remaining tasks. But - due to this wait mentioned above, the freezer won't be able to freeze - the cpu online hotplug task and hence freezing of tasks fails. - - As a result of this task freezing failure, the suspend operation gets - aborted. diff --git a/Documentation/power/suspend-and-interrupts.rst b/Documentation/power/suspend-and-interrupts.rst new file mode 100644 index 000000000000..4cda6617709a --- /dev/null +++ b/Documentation/power/suspend-and-interrupts.rst @@ -0,0 +1,137 @@ +==================================== +System Suspend and Device Interrupts +==================================== + +Copyright (C) 2014 Intel Corp. +Author: Rafael J. Wysocki + + +Suspending and Resuming Device IRQs +----------------------------------- + +Device interrupt request lines (IRQs) are generally disabled during system +suspend after the "late" phase of suspending devices (that is, after all of the +->prepare, ->suspend and ->suspend_late callbacks have been executed for all +devices). That is done by suspend_device_irqs(). + +The rationale for doing so is that after the "late" phase of device suspend +there is no legitimate reason why any interrupts from suspended devices should +trigger and if any devices have not been suspended properly yet, it is better to +block interrupts from them anyway. Also, in the past we had problems with +interrupt handlers for shared IRQs that device drivers implementing them were +not prepared for interrupts triggering after their devices had been suspended. +In some cases they would attempt to access, for example, memory address spaces +of suspended devices and cause unpredictable behavior to ensue as a result. +Unfortunately, such problems are very difficult to debug and the introduction +of suspend_device_irqs(), along with the "noirq" phase of device suspend and +resume, was the only practical way to mitigate them. + +Device IRQs are re-enabled during system resume, right before the "early" phase +of resuming devices (that is, before starting to execute ->resume_early +callbacks for devices). The function doing that is resume_device_irqs(). + + +The IRQF_NO_SUSPEND Flag +------------------------ + +There are interrupts that can legitimately trigger during the entire system +suspend-resume cycle, including the "noirq" phases of suspending and resuming +devices as well as during the time when nonboot CPUs are taken offline and +brought back online. That applies to timer interrupts in the first place, +but also to IPIs and to some other special-purpose interrupts. + +The IRQF_NO_SUSPEND flag is used to indicate that to the IRQ subsystem when +requesting a special-purpose interrupt. It causes suspend_device_irqs() to +leave the corresponding IRQ enabled so as to allow the interrupt to work as +expected during the suspend-resume cycle, but does not guarantee that the +interrupt will wake the system from a suspended state -- for such cases it is +necessary to use enable_irq_wake(). + +Note that the IRQF_NO_SUSPEND flag affects the entire IRQ and not just one +user of it. Thus, if the IRQ is shared, all of the interrupt handlers installed +for it will be executed as usual after suspend_device_irqs(), even if the +IRQF_NO_SUSPEND flag was not passed to request_irq() (or equivalent) by some of +the IRQ's users. For this reason, using IRQF_NO_SUSPEND and IRQF_SHARED at the +same time should be avoided. + + +System Wakeup Interrupts, enable_irq_wake() and disable_irq_wake() +------------------------------------------------------------------ + +System wakeup interrupts generally need to be configured to wake up the system +from sleep states, especially if they are used for different purposes (e.g. as +I/O interrupts) in the working state. + +That may involve turning on a special signal handling logic within the platform +(such as an SoC) so that signals from a given line are routed in a different way +during system sleep so as to trigger a system wakeup when needed. For example, +the platform may include a dedicated interrupt controller used specifically for +handling system wakeup events. Then, if a given interrupt line is supposed to +wake up the system from sleep sates, the corresponding input of that interrupt +controller needs to be enabled to receive signals from the line in question. +After wakeup, it generally is better to disable that input to prevent the +dedicated controller from triggering interrupts unnecessarily. + +The IRQ subsystem provides two helper functions to be used by device drivers for +those purposes. Namely, enable_irq_wake() turns on the platform's logic for +handling the given IRQ as a system wakeup interrupt line and disable_irq_wake() +turns that logic off. + +Calling enable_irq_wake() causes suspend_device_irqs() to treat the given IRQ +in a special way. Namely, the IRQ remains enabled, by on the first interrupt +it will be disabled, marked as pending and "suspended" so that it will be +re-enabled by resume_device_irqs() during the subsequent system resume. Also +the PM core is notified about the event which causes the system suspend in +progress to be aborted (that doesn't have to happen immediately, but at one +of the points where the suspend thread looks for pending wakeup events). + +This way every interrupt from a wakeup interrupt source will either cause the +system suspend currently in progress to be aborted or wake up the system if +already suspended. However, after suspend_device_irqs() interrupt handlers are +not executed for system wakeup IRQs. They are only executed for IRQF_NO_SUSPEND +IRQs at that time, but those IRQs should not be configured for system wakeup +using enable_irq_wake(). + + +Interrupts and Suspend-to-Idle +------------------------------ + +Suspend-to-idle (also known as the "freeze" sleep state) is a relatively new +system sleep state that works by idling all of the processors and waiting for +interrupts right after the "noirq" phase of suspending devices. + +Of course, this means that all of the interrupts with the IRQF_NO_SUSPEND flag +set will bring CPUs out of idle while in that state, but they will not cause the +IRQ subsystem to trigger a system wakeup. + +System wakeup interrupts, in turn, will trigger wakeup from suspend-to-idle in +analogy with what they do in the full system suspend case. The only difference +is that the wakeup from suspend-to-idle is signaled using the usual working +state interrupt delivery mechanisms and doesn't require the platform to use +any special interrupt handling logic for it to work. + + +IRQF_NO_SUSPEND and enable_irq_wake() +------------------------------------- + +There are very few valid reasons to use both enable_irq_wake() and the +IRQF_NO_SUSPEND flag on the same IRQ, and it is never valid to use both for the +same device. + +First of all, if the IRQ is not shared, the rules for handling IRQF_NO_SUSPEND +interrupts (interrupt handlers are invoked after suspend_device_irqs()) are +directly at odds with the rules for handling system wakeup interrupts (interrupt +handlers are not invoked after suspend_device_irqs()). + +Second, both enable_irq_wake() and IRQF_NO_SUSPEND apply to entire IRQs and not +to individual interrupt handlers, so sharing an IRQ between a system wakeup +interrupt source and an IRQF_NO_SUSPEND interrupt source does not generally +make sense. + +In rare cases an IRQ can be shared between a wakeup device driver and an +IRQF_NO_SUSPEND user. In order for this to be safe, the wakeup device driver +must be able to discern spurious IRQs from genuine wakeup events (signalling +the latter to the core with pm_system_wakeup()), must use enable_irq_wake() to +ensure that the IRQ will function as a wakeup source, and must request the IRQ +with IRQF_COND_SUSPEND to tell the core that it meets these requirements. If +these requirements are not met, it is not valid to use IRQF_COND_SUSPEND. diff --git a/Documentation/power/suspend-and-interrupts.txt b/Documentation/power/suspend-and-interrupts.txt deleted file mode 100644 index 8afb29a8604a..000000000000 --- a/Documentation/power/suspend-and-interrupts.txt +++ /dev/null @@ -1,135 +0,0 @@ -System Suspend and Device Interrupts - -Copyright (C) 2014 Intel Corp. -Author: Rafael J. Wysocki - - -Suspending and Resuming Device IRQs ------------------------------------ - -Device interrupt request lines (IRQs) are generally disabled during system -suspend after the "late" phase of suspending devices (that is, after all of the -->prepare, ->suspend and ->suspend_late callbacks have been executed for all -devices). That is done by suspend_device_irqs(). - -The rationale for doing so is that after the "late" phase of device suspend -there is no legitimate reason why any interrupts from suspended devices should -trigger and if any devices have not been suspended properly yet, it is better to -block interrupts from them anyway. Also, in the past we had problems with -interrupt handlers for shared IRQs that device drivers implementing them were -not prepared for interrupts triggering after their devices had been suspended. -In some cases they would attempt to access, for example, memory address spaces -of suspended devices and cause unpredictable behavior to ensue as a result. -Unfortunately, such problems are very difficult to debug and the introduction -of suspend_device_irqs(), along with the "noirq" phase of device suspend and -resume, was the only practical way to mitigate them. - -Device IRQs are re-enabled during system resume, right before the "early" phase -of resuming devices (that is, before starting to execute ->resume_early -callbacks for devices). The function doing that is resume_device_irqs(). - - -The IRQF_NO_SUSPEND Flag ------------------------- - -There are interrupts that can legitimately trigger during the entire system -suspend-resume cycle, including the "noirq" phases of suspending and resuming -devices as well as during the time when nonboot CPUs are taken offline and -brought back online. That applies to timer interrupts in the first place, -but also to IPIs and to some other special-purpose interrupts. - -The IRQF_NO_SUSPEND flag is used to indicate that to the IRQ subsystem when -requesting a special-purpose interrupt. It causes suspend_device_irqs() to -leave the corresponding IRQ enabled so as to allow the interrupt to work as -expected during the suspend-resume cycle, but does not guarantee that the -interrupt will wake the system from a suspended state -- for such cases it is -necessary to use enable_irq_wake(). - -Note that the IRQF_NO_SUSPEND flag affects the entire IRQ and not just one -user of it. Thus, if the IRQ is shared, all of the interrupt handlers installed -for it will be executed as usual after suspend_device_irqs(), even if the -IRQF_NO_SUSPEND flag was not passed to request_irq() (or equivalent) by some of -the IRQ's users. For this reason, using IRQF_NO_SUSPEND and IRQF_SHARED at the -same time should be avoided. - - -System Wakeup Interrupts, enable_irq_wake() and disable_irq_wake() ------------------------------------------------------------------- - -System wakeup interrupts generally need to be configured to wake up the system -from sleep states, especially if they are used for different purposes (e.g. as -I/O interrupts) in the working state. - -That may involve turning on a special signal handling logic within the platform -(such as an SoC) so that signals from a given line are routed in a different way -during system sleep so as to trigger a system wakeup when needed. For example, -the platform may include a dedicated interrupt controller used specifically for -handling system wakeup events. Then, if a given interrupt line is supposed to -wake up the system from sleep sates, the corresponding input of that interrupt -controller needs to be enabled to receive signals from the line in question. -After wakeup, it generally is better to disable that input to prevent the -dedicated controller from triggering interrupts unnecessarily. - -The IRQ subsystem provides two helper functions to be used by device drivers for -those purposes. Namely, enable_irq_wake() turns on the platform's logic for -handling the given IRQ as a system wakeup interrupt line and disable_irq_wake() -turns that logic off. - -Calling enable_irq_wake() causes suspend_device_irqs() to treat the given IRQ -in a special way. Namely, the IRQ remains enabled, by on the first interrupt -it will be disabled, marked as pending and "suspended" so that it will be -re-enabled by resume_device_irqs() during the subsequent system resume. Also -the PM core is notified about the event which causes the system suspend in -progress to be aborted (that doesn't have to happen immediately, but at one -of the points where the suspend thread looks for pending wakeup events). - -This way every interrupt from a wakeup interrupt source will either cause the -system suspend currently in progress to be aborted or wake up the system if -already suspended. However, after suspend_device_irqs() interrupt handlers are -not executed for system wakeup IRQs. They are only executed for IRQF_NO_SUSPEND -IRQs at that time, but those IRQs should not be configured for system wakeup -using enable_irq_wake(). - - -Interrupts and Suspend-to-Idle ------------------------------- - -Suspend-to-idle (also known as the "freeze" sleep state) is a relatively new -system sleep state that works by idling all of the processors and waiting for -interrupts right after the "noirq" phase of suspending devices. - -Of course, this means that all of the interrupts with the IRQF_NO_SUSPEND flag -set will bring CPUs out of idle while in that state, but they will not cause the -IRQ subsystem to trigger a system wakeup. - -System wakeup interrupts, in turn, will trigger wakeup from suspend-to-idle in -analogy with what they do in the full system suspend case. The only difference -is that the wakeup from suspend-to-idle is signaled using the usual working -state interrupt delivery mechanisms and doesn't require the platform to use -any special interrupt handling logic for it to work. - - -IRQF_NO_SUSPEND and enable_irq_wake() -------------------------------------- - -There are very few valid reasons to use both enable_irq_wake() and the -IRQF_NO_SUSPEND flag on the same IRQ, and it is never valid to use both for the -same device. - -First of all, if the IRQ is not shared, the rules for handling IRQF_NO_SUSPEND -interrupts (interrupt handlers are invoked after suspend_device_irqs()) are -directly at odds with the rules for handling system wakeup interrupts (interrupt -handlers are not invoked after suspend_device_irqs()). - -Second, both enable_irq_wake() and IRQF_NO_SUSPEND apply to entire IRQs and not -to individual interrupt handlers, so sharing an IRQ between a system wakeup -interrupt source and an IRQF_NO_SUSPEND interrupt source does not generally -make sense. - -In rare cases an IRQ can be shared between a wakeup device driver and an -IRQF_NO_SUSPEND user. In order for this to be safe, the wakeup device driver -must be able to discern spurious IRQs from genuine wakeup events (signalling -the latter to the core with pm_system_wakeup()), must use enable_irq_wake() to -ensure that the IRQ will function as a wakeup source, and must request the IRQ -with IRQF_COND_SUSPEND to tell the core that it meets these requirements. If -these requirements are not met, it is not valid to use IRQF_COND_SUSPEND. diff --git a/Documentation/power/swsusp-and-swap-files.rst b/Documentation/power/swsusp-and-swap-files.rst new file mode 100644 index 000000000000..a33a2919dbe4 --- /dev/null +++ b/Documentation/power/swsusp-and-swap-files.rst @@ -0,0 +1,63 @@ +=============================================== +Using swap files with software suspend (swsusp) +=============================================== + + (C) 2006 Rafael J. Wysocki + +The Linux kernel handles swap files almost in the same way as it handles swap +partitions and there are only two differences between these two types of swap +areas: +(1) swap files need not be contiguous, +(2) the header of a swap file is not in the first block of the partition that +holds it. From the swsusp's point of view (1) is not a problem, because it is +already taken care of by the swap-handling code, but (2) has to be taken into +consideration. + +In principle the location of a swap file's header may be determined with the +help of appropriate filesystem driver. Unfortunately, however, it requires the +filesystem holding the swap file to be mounted, and if this filesystem is +journaled, it cannot be mounted during resume from disk. For this reason to +identify a swap file swsusp uses the name of the partition that holds the file +and the offset from the beginning of the partition at which the swap file's +header is located. For convenience, this offset is expressed in +units. + +In order to use a swap file with swsusp, you need to: + +1) Create the swap file and make it active, eg.:: + + # dd if=/dev/zero of= bs=1024 count= + # mkswap + # swapon + +2) Use an application that will bmap the swap file with the help of the +FIBMAP ioctl and determine the location of the file's swap header, as the +offset, in units, from the beginning of the partition which +holds the swap file. + +3) Add the following parameters to the kernel command line:: + + resume= resume_offset= + +where is the partition on which the swap file is located +and is the offset of the swap header determined by the +application in 2) (of course, this step may be carried out automatically +by the same application that determines the swap file's header offset using the +FIBMAP ioctl) + +OR + +Use a userland suspend application that will set the partition and offset +with the help of the SNAPSHOT_SET_SWAP_AREA ioctl described in +Documentation/power/userland-swsusp.rst (this is the only method to suspend +to a swap file allowing the resume to be initiated from an initrd or initramfs +image). + +Now, swsusp will use the swap file in the same way in which it would use a swap +partition. In particular, the swap file has to be active (ie. be present in +/proc/swaps) so that it can be used for suspending. + +Note that if the swap file used for suspending is deleted and recreated, +the location of its header need not be the same as before. Thus every time +this happens the value of the "resume_offset=" kernel command line parameter +has to be updated. diff --git a/Documentation/power/swsusp-and-swap-files.txt b/Documentation/power/swsusp-and-swap-files.txt deleted file mode 100644 index f281886de490..000000000000 --- a/Documentation/power/swsusp-and-swap-files.txt +++ /dev/null @@ -1,60 +0,0 @@ -Using swap files with software suspend (swsusp) - (C) 2006 Rafael J. Wysocki - -The Linux kernel handles swap files almost in the same way as it handles swap -partitions and there are only two differences between these two types of swap -areas: -(1) swap files need not be contiguous, -(2) the header of a swap file is not in the first block of the partition that -holds it. From the swsusp's point of view (1) is not a problem, because it is -already taken care of by the swap-handling code, but (2) has to be taken into -consideration. - -In principle the location of a swap file's header may be determined with the -help of appropriate filesystem driver. Unfortunately, however, it requires the -filesystem holding the swap file to be mounted, and if this filesystem is -journaled, it cannot be mounted during resume from disk. For this reason to -identify a swap file swsusp uses the name of the partition that holds the file -and the offset from the beginning of the partition at which the swap file's -header is located. For convenience, this offset is expressed in -units. - -In order to use a swap file with swsusp, you need to: - -1) Create the swap file and make it active, eg. - -# dd if=/dev/zero of= bs=1024 count= -# mkswap -# swapon - -2) Use an application that will bmap the swap file with the help of the -FIBMAP ioctl and determine the location of the file's swap header, as the -offset, in units, from the beginning of the partition which -holds the swap file. - -3) Add the following parameters to the kernel command line: - -resume= resume_offset= - -where is the partition on which the swap file is located -and is the offset of the swap header determined by the -application in 2) (of course, this step may be carried out automatically -by the same application that determines the swap file's header offset using the -FIBMAP ioctl) - -OR - -Use a userland suspend application that will set the partition and offset -with the help of the SNAPSHOT_SET_SWAP_AREA ioctl described in -Documentation/power/userland-swsusp.txt (this is the only method to suspend -to a swap file allowing the resume to be initiated from an initrd or initramfs -image). - -Now, swsusp will use the swap file in the same way in which it would use a swap -partition. In particular, the swap file has to be active (ie. be present in -/proc/swaps) so that it can be used for suspending. - -Note that if the swap file used for suspending is deleted and recreated, -the location of its header need not be the same as before. Thus every time -this happens the value of the "resume_offset=" kernel command line parameter -has to be updated. diff --git a/Documentation/power/swsusp-dmcrypt.rst b/Documentation/power/swsusp-dmcrypt.rst new file mode 100644 index 000000000000..426df59172cd --- /dev/null +++ b/Documentation/power/swsusp-dmcrypt.rst @@ -0,0 +1,140 @@ +======================================= +How to use dm-crypt and swsusp together +======================================= + +Author: Andreas Steinmetz + + + +Some prerequisites: +You know how dm-crypt works. If not, visit the following web page: +http://www.saout.de/misc/dm-crypt/ +You have read Documentation/power/swsusp.rst and understand it. +You did read Documentation/admin-guide/initrd.rst and know how an initrd works. +You know how to create or how to modify an initrd. + +Now your system is properly set up, your disk is encrypted except for +the swap device(s) and the boot partition which may contain a mini +system for crypto setup and/or rescue purposes. You may even have +an initrd that does your current crypto setup already. + +At this point you want to encrypt your swap, too. Still you want to +be able to suspend using swsusp. This, however, means that you +have to be able to either enter a passphrase or that you read +the key(s) from an external device like a pcmcia flash disk +or an usb stick prior to resume. So you need an initrd, that sets +up dm-crypt and then asks swsusp to resume from the encrypted +swap device. + +The most important thing is that you set up dm-crypt in such +a way that the swap device you suspend to/resume from has +always the same major/minor within the initrd as well as +within your running system. The easiest way to achieve this is +to always set up this swap device first with dmsetup, so that +it will always look like the following:: + + brw------- 1 root root 254, 0 Jul 28 13:37 /dev/mapper/swap0 + +Now set up your kernel to use /dev/mapper/swap0 as the default +resume partition, so your kernel .config contains:: + + CONFIG_PM_STD_PARTITION="/dev/mapper/swap0" + +Prepare your boot loader to use the initrd you will create or +modify. For lilo the simplest setup looks like the following +lines:: + + image=/boot/vmlinuz + initrd=/boot/initrd.gz + label=linux + append="root=/dev/ram0 init=/linuxrc rw" + +Finally you need to create or modify your initrd. Lets assume +you create an initrd that reads the required dm-crypt setup +from a pcmcia flash disk card. The card is formatted with an ext2 +fs which resides on /dev/hde1 when the card is inserted. The +card contains at least the encrypted swap setup in a file +named "swapkey". /etc/fstab of your initrd contains something +like the following:: + + /dev/hda1 /mnt ext3 ro 0 0 + none /proc proc defaults,noatime,nodiratime 0 0 + none /sys sysfs defaults,noatime,nodiratime 0 0 + +/dev/hda1 contains an unencrypted mini system that sets up all +of your crypto devices, again by reading the setup from the +pcmcia flash disk. What follows now is a /linuxrc for your +initrd that allows you to resume from encrypted swap and that +continues boot with your mini system on /dev/hda1 if resume +does not happen:: + + #!/bin/sh + PATH=/sbin:/bin:/usr/sbin:/usr/bin + mount /proc + mount /sys + mapped=0 + noresume=`grep -c noresume /proc/cmdline` + if [ "$*" != "" ] + then + noresume=1 + fi + dmesg -n 1 + /sbin/cardmgr -q + for i in 1 2 3 4 5 6 7 8 9 0 + do + if [ -f /proc/ide/hde/media ] + then + usleep 500000 + mount -t ext2 -o ro /dev/hde1 /mnt + if [ -f /mnt/swapkey ] + then + dmsetup create swap0 /mnt/swapkey > /dev/null 2>&1 && mapped=1 + fi + umount /mnt + break + fi + usleep 500000 + done + killproc /sbin/cardmgr + dmesg -n 6 + if [ $mapped = 1 ] + then + if [ $noresume != 0 ] + then + mkswap /dev/mapper/swap0 > /dev/null 2>&1 + fi + echo 254:0 > /sys/power/resume + dmsetup remove swap0 + fi + umount /sys + mount /mnt + umount /proc + cd /mnt + pivot_root . mnt + mount /proc + umount -l /mnt + umount /proc + exec chroot . /sbin/init $* < dev/console > dev/console 2>&1 + +Please don't mind the weird loop above, busybox's msh doesn't know +the let statement. Now, what is happening in the script? +First we have to decide if we want to try to resume, or not. +We will not resume if booting with "noresume" or any parameters +for init like "single" or "emergency" as boot parameters. + +Then we need to set up dmcrypt with the setup data from the +pcmcia flash disk. If this succeeds we need to reset the swap +device if we don't want to resume. The line "echo 254:0 > /sys/power/resume" +then attempts to resume from the first device mapper device. +Note that it is important to set the device in /sys/power/resume, +regardless if resuming or not, otherwise later suspend will fail. +If resume starts, script execution terminates here. + +Otherwise we just remove the encrypted swap device and leave it to the +mini system on /dev/hda1 to set the whole crypto up (it is up to +you to modify this to your taste). + +What then follows is the well known process to change the root +file system and continue booting from there. I prefer to unmount +the initrd prior to continue booting but it is up to you to modify +this. diff --git a/Documentation/power/swsusp-dmcrypt.txt b/Documentation/power/swsusp-dmcrypt.txt deleted file mode 100644 index b802fbfd95ef..000000000000 --- a/Documentation/power/swsusp-dmcrypt.txt +++ /dev/null @@ -1,138 +0,0 @@ -Author: Andreas Steinmetz - - -How to use dm-crypt and swsusp together: -======================================== - -Some prerequisites: -You know how dm-crypt works. If not, visit the following web page: -http://www.saout.de/misc/dm-crypt/ -You have read Documentation/power/swsusp.txt and understand it. -You did read Documentation/admin-guide/initrd.rst and know how an initrd works. -You know how to create or how to modify an initrd. - -Now your system is properly set up, your disk is encrypted except for -the swap device(s) and the boot partition which may contain a mini -system for crypto setup and/or rescue purposes. You may even have -an initrd that does your current crypto setup already. - -At this point you want to encrypt your swap, too. Still you want to -be able to suspend using swsusp. This, however, means that you -have to be able to either enter a passphrase or that you read -the key(s) from an external device like a pcmcia flash disk -or an usb stick prior to resume. So you need an initrd, that sets -up dm-crypt and then asks swsusp to resume from the encrypted -swap device. - -The most important thing is that you set up dm-crypt in such -a way that the swap device you suspend to/resume from has -always the same major/minor within the initrd as well as -within your running system. The easiest way to achieve this is -to always set up this swap device first with dmsetup, so that -it will always look like the following: - -brw------- 1 root root 254, 0 Jul 28 13:37 /dev/mapper/swap0 - -Now set up your kernel to use /dev/mapper/swap0 as the default -resume partition, so your kernel .config contains: - -CONFIG_PM_STD_PARTITION="/dev/mapper/swap0" - -Prepare your boot loader to use the initrd you will create or -modify. For lilo the simplest setup looks like the following -lines: - -image=/boot/vmlinuz -initrd=/boot/initrd.gz -label=linux -append="root=/dev/ram0 init=/linuxrc rw" - -Finally you need to create or modify your initrd. Lets assume -you create an initrd that reads the required dm-crypt setup -from a pcmcia flash disk card. The card is formatted with an ext2 -fs which resides on /dev/hde1 when the card is inserted. The -card contains at least the encrypted swap setup in a file -named "swapkey". /etc/fstab of your initrd contains something -like the following: - -/dev/hda1 /mnt ext3 ro 0 0 -none /proc proc defaults,noatime,nodiratime 0 0 -none /sys sysfs defaults,noatime,nodiratime 0 0 - -/dev/hda1 contains an unencrypted mini system that sets up all -of your crypto devices, again by reading the setup from the -pcmcia flash disk. What follows now is a /linuxrc for your -initrd that allows you to resume from encrypted swap and that -continues boot with your mini system on /dev/hda1 if resume -does not happen: - -#!/bin/sh -PATH=/sbin:/bin:/usr/sbin:/usr/bin -mount /proc -mount /sys -mapped=0 -noresume=`grep -c noresume /proc/cmdline` -if [ "$*" != "" ] -then - noresume=1 -fi -dmesg -n 1 -/sbin/cardmgr -q -for i in 1 2 3 4 5 6 7 8 9 0 -do - if [ -f /proc/ide/hde/media ] - then - usleep 500000 - mount -t ext2 -o ro /dev/hde1 /mnt - if [ -f /mnt/swapkey ] - then - dmsetup create swap0 /mnt/swapkey > /dev/null 2>&1 && mapped=1 - fi - umount /mnt - break - fi - usleep 500000 -done -killproc /sbin/cardmgr -dmesg -n 6 -if [ $mapped = 1 ] -then - if [ $noresume != 0 ] - then - mkswap /dev/mapper/swap0 > /dev/null 2>&1 - fi - echo 254:0 > /sys/power/resume - dmsetup remove swap0 -fi -umount /sys -mount /mnt -umount /proc -cd /mnt -pivot_root . mnt -mount /proc -umount -l /mnt -umount /proc -exec chroot . /sbin/init $* < dev/console > dev/console 2>&1 - -Please don't mind the weird loop above, busybox's msh doesn't know -the let statement. Now, what is happening in the script? -First we have to decide if we want to try to resume, or not. -We will not resume if booting with "noresume" or any parameters -for init like "single" or "emergency" as boot parameters. - -Then we need to set up dmcrypt with the setup data from the -pcmcia flash disk. If this succeeds we need to reset the swap -device if we don't want to resume. The line "echo 254:0 > /sys/power/resume" -then attempts to resume from the first device mapper device. -Note that it is important to set the device in /sys/power/resume, -regardless if resuming or not, otherwise later suspend will fail. -If resume starts, script execution terminates here. - -Otherwise we just remove the encrypted swap device and leave it to the -mini system on /dev/hda1 to set the whole crypto up (it is up to -you to modify this to your taste). - -What then follows is the well known process to change the root -file system and continue booting from there. I prefer to unmount -the initrd prior to continue booting but it is up to you to modify -this. diff --git a/Documentation/power/swsusp.rst b/Documentation/power/swsusp.rst new file mode 100644 index 000000000000..d000312f6965 --- /dev/null +++ b/Documentation/power/swsusp.rst @@ -0,0 +1,501 @@ +============ +Swap suspend +============ + +Some warnings, first. + +.. warning:: + + **BIG FAT WARNING** + + If you touch anything on disk between suspend and resume... + ...kiss your data goodbye. + + If you do resume from initrd after your filesystems are mounted... + ...bye bye root partition. + + [this is actually same case as above] + + If you have unsupported ( ) devices using DMA, you may have some + problems. If your disk driver does not support suspend... (IDE does), + it may cause some problems, too. If you change kernel command line + between suspend and resume, it may do something wrong. If you change + your hardware while system is suspended... well, it was not good idea; + but it will probably only crash. + + ( ) suspend/resume support is needed to make it safe. + + If you have any filesystems on USB devices mounted before software suspend, + they won't be accessible after resume and you may lose data, as though + you have unplugged the USB devices with mounted filesystems on them; + see the FAQ below for details. (This is not true for more traditional + power states like "standby", which normally don't turn USB off.) + +Swap partition: + You need to append resume=/dev/your_swap_partition to kernel command + line or specify it using /sys/power/resume. + +Swap file: + If using a swapfile you can also specify a resume offset using + resume_offset= on the kernel command line or specify it + in /sys/power/resume_offset. + +After preparing then you suspend by:: + + echo shutdown > /sys/power/disk; echo disk > /sys/power/state + +- If you feel ACPI works pretty well on your system, you might try:: + + echo platform > /sys/power/disk; echo disk > /sys/power/state + +- If you would like to write hibernation image to swap and then suspend + to RAM (provided your platform supports it), you can try:: + + echo suspend > /sys/power/disk; echo disk > /sys/power/state + +- If you have SATA disks, you'll need recent kernels with SATA suspend + support. For suspend and resume to work, make sure your disk drivers + are built into kernel -- not modules. [There's way to make + suspend/resume with modular disk drivers, see FAQ, but you probably + should not do that.] + +If you want to limit the suspend image size to N bytes, do:: + + echo N > /sys/power/image_size + +before suspend (it is limited to around 2/5 of available RAM by default). + +- The resume process checks for the presence of the resume device, + if found, it then checks the contents for the hibernation image signature. + If both are found, it resumes the hibernation image. + +- The resume process may be triggered in two ways: + + 1) During lateinit: If resume=/dev/your_swap_partition is specified on + the kernel command line, lateinit runs the resume process. If the + resume device has not been probed yet, the resume process fails and + bootup continues. + 2) Manually from an initrd or initramfs: May be run from + the init script by using the /sys/power/resume file. It is vital + that this be done prior to remounting any filesystems (even as + read-only) otherwise data may be corrupted. + +Article about goals and implementation of Software Suspend for Linux +==================================================================== + +Author: Gábor Kuti +Last revised: 2003-10-20 by Pavel Machek + +Idea and goals to achieve +------------------------- + +Nowadays it is common in several laptops that they have a suspend button. It +saves the state of the machine to a filesystem or to a partition and switches +to standby mode. Later resuming the machine the saved state is loaded back to +ram and the machine can continue its work. It has two real benefits. First we +save ourselves the time machine goes down and later boots up, energy costs +are real high when running from batteries. The other gain is that we don't have +to interrupt our programs so processes that are calculating something for a long +time shouldn't need to be written interruptible. + +swsusp saves the state of the machine into active swaps and then reboots or +powerdowns. You must explicitly specify the swap partition to resume from with +`resume=` kernel option. If signature is found it loads and restores saved +state. If the option `noresume` is specified as a boot parameter, it skips +the resuming. If the option `hibernate=nocompress` is specified as a boot +parameter, it saves hibernation image without compression. + +In the meantime while the system is suspended you should not add/remove any +of the hardware, write to the filesystems, etc. + +Sleep states summary +==================== + +There are three different interfaces you can use, /proc/acpi should +work like this: + +In a really perfect world:: + + echo 1 > /proc/acpi/sleep # for standby + echo 2 > /proc/acpi/sleep # for suspend to ram + echo 3 > /proc/acpi/sleep # for suspend to ram, but with more power conservative + echo 4 > /proc/acpi/sleep # for suspend to disk + echo 5 > /proc/acpi/sleep # for shutdown unfriendly the system + +and perhaps:: + + echo 4b > /proc/acpi/sleep # for suspend to disk via s4bios + +Frequently Asked Questions +========================== + +Q: + well, suspending a server is IMHO a really stupid thing, + but... (Diego Zuccato): + +A: + You bought new UPS for your server. How do you install it without + bringing machine down? Suspend to disk, rearrange power cables, + resume. + + You have your server on UPS. Power died, and UPS is indicating 30 + seconds to failure. What do you do? Suspend to disk. + + +Q: + Maybe I'm missing something, but why don't the regular I/O paths work? + +A: + We do use the regular I/O paths. However we cannot restore the data + to its original location as we load it. That would create an + inconsistent kernel state which would certainly result in an oops. + Instead, we load the image into unused memory and then atomically copy + it back to it original location. This implies, of course, a maximum + image size of half the amount of memory. + + There are two solutions to this: + + * require half of memory to be free during suspend. That way you can + read "new" data onto free spots, then cli and copy + + * assume we had special "polling" ide driver that only uses memory + between 0-640KB. That way, I'd have to make sure that 0-640KB is free + during suspending, but otherwise it would work... + + suspend2 shares this fundamental limitation, but does not include user + data and disk caches into "used memory" by saving them in + advance. That means that the limitation goes away in practice. + +Q: + Does linux support ACPI S4? + +A: + Yes. That's what echo platform > /sys/power/disk does. + +Q: + What is 'suspend2'? + +A: + suspend2 is 'Software Suspend 2', a forked implementation of + suspend-to-disk which is available as separate patches for 2.4 and 2.6 + kernels from swsusp.sourceforge.net. It includes support for SMP, 4GB + highmem and preemption. It also has a extensible architecture that + allows for arbitrary transformations on the image (compression, + encryption) and arbitrary backends for writing the image (eg to swap + or an NFS share[Work In Progress]). Questions regarding suspend2 + should be sent to the mailing list available through the suspend2 + website, and not to the Linux Kernel Mailing List. We are working + toward merging suspend2 into the mainline kernel. + +Q: + What is the freezing of tasks and why are we using it? + +A: + The freezing of tasks is a mechanism by which user space processes and some + kernel threads are controlled during hibernation or system-wide suspend (on some + architectures). See freezing-of-tasks.txt for details. + +Q: + What is the difference between "platform" and "shutdown"? + +A: + shutdown: + save state in linux, then tell bios to powerdown + + platform: + save state in linux, then tell bios to powerdown and blink + "suspended led" + + "platform" is actually right thing to do where supported, but + "shutdown" is most reliable (except on ACPI systems). + +Q: + I do not understand why you have such strong objections to idea of + selective suspend. + +A: + Do selective suspend during runtime power management, that's okay. But + it's useless for suspend-to-disk. (And I do not see how you could use + it for suspend-to-ram, I hope you do not want that). + + Lets see, so you suggest to + + * SUSPEND all but swap device and parents + * Snapshot + * Write image to disk + * SUSPEND swap device and parents + * Powerdown + + Oh no, that does not work, if swap device or its parents uses DMA, + you've corrupted data. You'd have to do + + * SUSPEND all but swap device and parents + * FREEZE swap device and parents + * Snapshot + * UNFREEZE swap device and parents + * Write + * SUSPEND swap device and parents + + Which means that you still need that FREEZE state, and you get more + complicated code. (And I have not yet introduce details like system + devices). + +Q: + There don't seem to be any generally useful behavioral + distinctions between SUSPEND and FREEZE. + +A: + Doing SUSPEND when you are asked to do FREEZE is always correct, + but it may be unnecessarily slow. If you want your driver to stay simple, + slowness may not matter to you. It can always be fixed later. + + For devices like disk it does matter, you do not want to spindown for + FREEZE. + +Q: + After resuming, system is paging heavily, leading to very bad interactivity. + +A: + Try running:: + + cat /proc/[0-9]*/maps | grep / | sed 's:.* /:/:' | sort -u | while read file + do + test -f "$file" && cat "$file" > /dev/null + done + + after resume. swapoff -a; swapon -a may also be useful. + +Q: + What happens to devices during swsusp? They seem to be resumed + during system suspend? + +A: + That's correct. We need to resume them if we want to write image to + disk. Whole sequence goes like + + **Suspend part** + + running system, user asks for suspend-to-disk + + user processes are stopped + + suspend(PMSG_FREEZE): devices are frozen so that they don't interfere + with state snapshot + + state snapshot: copy of whole used memory is taken with interrupts disabled + + resume(): devices are woken up so that we can write image to swap + + write image to swap + + suspend(PMSG_SUSPEND): suspend devices so that we can power off + + turn the power off + + **Resume part** + + (is actually pretty similar) + + running system, user asks for suspend-to-disk + + user processes are stopped (in common case there are none, + but with resume-from-initrd, no one knows) + + read image from disk + + suspend(PMSG_FREEZE): devices are frozen so that they don't interfere + with image restoration + + image restoration: rewrite memory with image + + resume(): devices are woken up so that system can continue + + thaw all user processes + +Q: + What is this 'Encrypt suspend image' for? + +A: + First of all: it is not a replacement for dm-crypt encrypted swap. + It cannot protect your computer while it is suspended. Instead it does + protect from leaking sensitive data after resume from suspend. + + Think of the following: you suspend while an application is running + that keeps sensitive data in memory. The application itself prevents + the data from being swapped out. Suspend, however, must write these + data to swap to be able to resume later on. Without suspend encryption + your sensitive data are then stored in plaintext on disk. This means + that after resume your sensitive data are accessible to all + applications having direct access to the swap device which was used + for suspend. If you don't need swap after resume these data can remain + on disk virtually forever. Thus it can happen that your system gets + broken in weeks later and sensitive data which you thought were + encrypted and protected are retrieved and stolen from the swap device. + To prevent this situation you should use 'Encrypt suspend image'. + + During suspend a temporary key is created and this key is used to + encrypt the data written to disk. When, during resume, the data was + read back into memory the temporary key is destroyed which simply + means that all data written to disk during suspend are then + inaccessible so they can't be stolen later on. The only thing that + you must then take care of is that you call 'mkswap' for the swap + partition used for suspend as early as possible during regular + boot. This asserts that any temporary key from an oopsed suspend or + from a failed or aborted resume is erased from the swap device. + + As a rule of thumb use encrypted swap to protect your data while your + system is shut down or suspended. Additionally use the encrypted + suspend image to prevent sensitive data from being stolen after + resume. + +Q: + Can I suspend to a swap file? + +A: + Generally, yes, you can. However, it requires you to use the "resume=" and + "resume_offset=" kernel command line parameters, so the resume from a swap file + cannot be initiated from an initrd or initramfs image. See + swsusp-and-swap-files.txt for details. + +Q: + Is there a maximum system RAM size that is supported by swsusp? + +A: + It should work okay with highmem. + +Q: + Does swsusp (to disk) use only one swap partition or can it use + multiple swap partitions (aggregate them into one logical space)? + +A: + Only one swap partition, sorry. + +Q: + If my application(s) causes lots of memory & swap space to be used + (over half of the total system RAM), is it correct that it is likely + to be useless to try to suspend to disk while that app is running? + +A: + No, it should work okay, as long as your app does not mlock() + it. Just prepare big enough swap partition. + +Q: + What information is useful for debugging suspend-to-disk problems? + +A: + Well, last messages on the screen are always useful. If something + is broken, it is usually some kernel driver, therefore trying with as + little as possible modules loaded helps a lot. I also prefer people to + suspend from console, preferably without X running. Booting with + init=/bin/bash, then swapon and starting suspend sequence manually + usually does the trick. Then it is good idea to try with latest + vanilla kernel. + +Q: + How can distributions ship a swsusp-supporting kernel with modular + disk drivers (especially SATA)? + +A: + Well, it can be done, load the drivers, then do echo into + /sys/power/resume file from initrd. Be sure not to mount + anything, not even read-only mount, or you are going to lose your + data. + +Q: + How do I make suspend more verbose? + +A: + If you want to see any non-error kernel messages on the virtual + terminal the kernel switches to during suspend, you have to set the + kernel console loglevel to at least 4 (KERN_WARNING), for example by + doing:: + + # save the old loglevel + read LOGLEVEL DUMMY < /proc/sys/kernel/printk + # set the loglevel so we see the progress bar. + # if the level is higher than needed, we leave it alone. + if [ $LOGLEVEL -lt 5 ]; then + echo 5 > /proc/sys/kernel/printk + fi + + IMG_SZ=0 + read IMG_SZ < /sys/power/image_size + echo -n disk > /sys/power/state + RET=$? + # + # the logic here is: + # if image_size > 0 (without kernel support, IMG_SZ will be zero), + # then try again with image_size set to zero. + if [ $RET -ne 0 -a $IMG_SZ -ne 0 ]; then # try again with minimal image size + echo 0 > /sys/power/image_size + echo -n disk > /sys/power/state + RET=$? + fi + + # restore previous loglevel + echo $LOGLEVEL > /proc/sys/kernel/printk + exit $RET + +Q: + Is this true that if I have a mounted filesystem on a USB device and + I suspend to disk, I can lose data unless the filesystem has been mounted + with "sync"? + +A: + That's right ... if you disconnect that device, you may lose data. + In fact, even with "-o sync" you can lose data if your programs have + information in buffers they haven't written out to a disk you disconnect, + or if you disconnect before the device finished saving data you wrote. + + Software suspend normally powers down USB controllers, which is equivalent + to disconnecting all USB devices attached to your system. + + Your system might well support low-power modes for its USB controllers + while the system is asleep, maintaining the connection, using true sleep + modes like "suspend-to-RAM" or "standby". (Don't write "disk" to the + /sys/power/state file; write "standby" or "mem".) We've not seen any + hardware that can use these modes through software suspend, although in + theory some systems might support "platform" modes that won't break the + USB connections. + + Remember that it's always a bad idea to unplug a disk drive containing a + mounted filesystem. That's true even when your system is asleep! The + safest thing is to unmount all filesystems on removable media (such USB, + Firewire, CompactFlash, MMC, external SATA, or even IDE hotplug bays) + before suspending; then remount them after resuming. + + There is a work-around for this problem. For more information, see + Documentation/driver-api/usb/persist.rst. + +Q: + Can I suspend-to-disk using a swap partition under LVM? + +A: + Yes and No. You can suspend successfully, but the kernel will not be able + to resume on its own. You need an initramfs that can recognize the resume + situation, activate the logical volume containing the swap volume (but not + touch any filesystems!), and eventually call:: + + echo -n "$major:$minor" > /sys/power/resume + + where $major and $minor are the respective major and minor device numbers of + the swap volume. + + uswsusp works with LVM, too. See http://suspend.sourceforge.net/ + +Q: + I upgraded the kernel from 2.6.15 to 2.6.16. Both kernels were + compiled with the similar configuration files. Anyway I found that + suspend to disk (and resume) is much slower on 2.6.16 compared to + 2.6.15. Any idea for why that might happen or how can I speed it up? + +A: + This is because the size of the suspend image is now greater than + for 2.6.15 (by saving more data we can get more responsive system + after resume). + + There's the /sys/power/image_size knob that controls the size of the + image. If you set it to 0 (eg. by echo 0 > /sys/power/image_size as + root), the 2.6.15 behavior should be restored. If it is still too + slow, take a look at suspend.sf.net -- userland suspend is faster and + supports LZF compression to speed it up further. diff --git a/Documentation/power/swsusp.txt b/Documentation/power/swsusp.txt deleted file mode 100644 index 236d1fb13640..000000000000 --- a/Documentation/power/swsusp.txt +++ /dev/null @@ -1,446 +0,0 @@ -Some warnings, first. - - * BIG FAT WARNING ********************************************************* - * - * If you touch anything on disk between suspend and resume... - * ...kiss your data goodbye. - * - * If you do resume from initrd after your filesystems are mounted... - * ...bye bye root partition. - * [this is actually same case as above] - * - * If you have unsupported (*) devices using DMA, you may have some - * problems. If your disk driver does not support suspend... (IDE does), - * it may cause some problems, too. If you change kernel command line - * between suspend and resume, it may do something wrong. If you change - * your hardware while system is suspended... well, it was not good idea; - * but it will probably only crash. - * - * (*) suspend/resume support is needed to make it safe. - * - * If you have any filesystems on USB devices mounted before software suspend, - * they won't be accessible after resume and you may lose data, as though - * you have unplugged the USB devices with mounted filesystems on them; - * see the FAQ below for details. (This is not true for more traditional - * power states like "standby", which normally don't turn USB off.) - -Swap partition: -You need to append resume=/dev/your_swap_partition to kernel command -line or specify it using /sys/power/resume. - -Swap file: -If using a swapfile you can also specify a resume offset using -resume_offset= on the kernel command line or specify it -in /sys/power/resume_offset. - -After preparing then you suspend by - -echo shutdown > /sys/power/disk; echo disk > /sys/power/state - -. If you feel ACPI works pretty well on your system, you might try - -echo platform > /sys/power/disk; echo disk > /sys/power/state - -. If you would like to write hibernation image to swap and then suspend -to RAM (provided your platform supports it), you can try - -echo suspend > /sys/power/disk; echo disk > /sys/power/state - -. If you have SATA disks, you'll need recent kernels with SATA suspend -support. For suspend and resume to work, make sure your disk drivers -are built into kernel -- not modules. [There's way to make -suspend/resume with modular disk drivers, see FAQ, but you probably -should not do that.] - -If you want to limit the suspend image size to N bytes, do - -echo N > /sys/power/image_size - -before suspend (it is limited to around 2/5 of available RAM by default). - -. The resume process checks for the presence of the resume device, -if found, it then checks the contents for the hibernation image signature. -If both are found, it resumes the hibernation image. - -. The resume process may be triggered in two ways: - 1) During lateinit: If resume=/dev/your_swap_partition is specified on - the kernel command line, lateinit runs the resume process. If the - resume device has not been probed yet, the resume process fails and - bootup continues. - 2) Manually from an initrd or initramfs: May be run from - the init script by using the /sys/power/resume file. It is vital - that this be done prior to remounting any filesystems (even as - read-only) otherwise data may be corrupted. - -Article about goals and implementation of Software Suspend for Linux -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Author: Gábor Kuti -Last revised: 2003-10-20 by Pavel Machek - -Idea and goals to achieve - -Nowadays it is common in several laptops that they have a suspend button. It -saves the state of the machine to a filesystem or to a partition and switches -to standby mode. Later resuming the machine the saved state is loaded back to -ram and the machine can continue its work. It has two real benefits. First we -save ourselves the time machine goes down and later boots up, energy costs -are real high when running from batteries. The other gain is that we don't have to -interrupt our programs so processes that are calculating something for a long -time shouldn't need to be written interruptible. - -swsusp saves the state of the machine into active swaps and then reboots or -powerdowns. You must explicitly specify the swap partition to resume from with -``resume='' kernel option. If signature is found it loads and restores saved -state. If the option ``noresume'' is specified as a boot parameter, it skips -the resuming. If the option ``hibernate=nocompress'' is specified as a boot -parameter, it saves hibernation image without compression. - -In the meantime while the system is suspended you should not add/remove any -of the hardware, write to the filesystems, etc. - -Sleep states summary -==================== - -There are three different interfaces you can use, /proc/acpi should -work like this: - -In a really perfect world: -echo 1 > /proc/acpi/sleep # for standby -echo 2 > /proc/acpi/sleep # for suspend to ram -echo 3 > /proc/acpi/sleep # for suspend to ram, but with more power conservative -echo 4 > /proc/acpi/sleep # for suspend to disk -echo 5 > /proc/acpi/sleep # for shutdown unfriendly the system - -and perhaps -echo 4b > /proc/acpi/sleep # for suspend to disk via s4bios - -Frequently Asked Questions -========================== - -Q: well, suspending a server is IMHO a really stupid thing, -but... (Diego Zuccato): - -A: You bought new UPS for your server. How do you install it without -bringing machine down? Suspend to disk, rearrange power cables, -resume. - -You have your server on UPS. Power died, and UPS is indicating 30 -seconds to failure. What do you do? Suspend to disk. - - -Q: Maybe I'm missing something, but why don't the regular I/O paths work? - -A: We do use the regular I/O paths. However we cannot restore the data -to its original location as we load it. That would create an -inconsistent kernel state which would certainly result in an oops. -Instead, we load the image into unused memory and then atomically copy -it back to it original location. This implies, of course, a maximum -image size of half the amount of memory. - -There are two solutions to this: - -* require half of memory to be free during suspend. That way you can -read "new" data onto free spots, then cli and copy - -* assume we had special "polling" ide driver that only uses memory -between 0-640KB. That way, I'd have to make sure that 0-640KB is free -during suspending, but otherwise it would work... - -suspend2 shares this fundamental limitation, but does not include user -data and disk caches into "used memory" by saving them in -advance. That means that the limitation goes away in practice. - -Q: Does linux support ACPI S4? - -A: Yes. That's what echo platform > /sys/power/disk does. - -Q: What is 'suspend2'? - -A: suspend2 is 'Software Suspend 2', a forked implementation of -suspend-to-disk which is available as separate patches for 2.4 and 2.6 -kernels from swsusp.sourceforge.net. It includes support for SMP, 4GB -highmem and preemption. It also has a extensible architecture that -allows for arbitrary transformations on the image (compression, -encryption) and arbitrary backends for writing the image (eg to swap -or an NFS share[Work In Progress]). Questions regarding suspend2 -should be sent to the mailing list available through the suspend2 -website, and not to the Linux Kernel Mailing List. We are working -toward merging suspend2 into the mainline kernel. - -Q: What is the freezing of tasks and why are we using it? - -A: The freezing of tasks is a mechanism by which user space processes and some -kernel threads are controlled during hibernation or system-wide suspend (on some -architectures). See freezing-of-tasks.txt for details. - -Q: What is the difference between "platform" and "shutdown"? - -A: - -shutdown: save state in linux, then tell bios to powerdown - -platform: save state in linux, then tell bios to powerdown and blink - "suspended led" - -"platform" is actually right thing to do where supported, but -"shutdown" is most reliable (except on ACPI systems). - -Q: I do not understand why you have such strong objections to idea of -selective suspend. - -A: Do selective suspend during runtime power management, that's okay. But -it's useless for suspend-to-disk. (And I do not see how you could use -it for suspend-to-ram, I hope you do not want that). - -Lets see, so you suggest to - -* SUSPEND all but swap device and parents -* Snapshot -* Write image to disk -* SUSPEND swap device and parents -* Powerdown - -Oh no, that does not work, if swap device or its parents uses DMA, -you've corrupted data. You'd have to do - -* SUSPEND all but swap device and parents -* FREEZE swap device and parents -* Snapshot -* UNFREEZE swap device and parents -* Write -* SUSPEND swap device and parents - -Which means that you still need that FREEZE state, and you get more -complicated code. (And I have not yet introduce details like system -devices). - -Q: There don't seem to be any generally useful behavioral -distinctions between SUSPEND and FREEZE. - -A: Doing SUSPEND when you are asked to do FREEZE is always correct, -but it may be unnecessarily slow. If you want your driver to stay simple, -slowness may not matter to you. It can always be fixed later. - -For devices like disk it does matter, you do not want to spindown for -FREEZE. - -Q: After resuming, system is paging heavily, leading to very bad interactivity. - -A: Try running - -cat /proc/[0-9]*/maps | grep / | sed 's:.* /:/:' | sort -u | while read file -do - test -f "$file" && cat "$file" > /dev/null -done - -after resume. swapoff -a; swapon -a may also be useful. - -Q: What happens to devices during swsusp? They seem to be resumed -during system suspend? - -A: That's correct. We need to resume them if we want to write image to -disk. Whole sequence goes like - - Suspend part - ~~~~~~~~~~~~ - running system, user asks for suspend-to-disk - - user processes are stopped - - suspend(PMSG_FREEZE): devices are frozen so that they don't interfere - with state snapshot - - state snapshot: copy of whole used memory is taken with interrupts disabled - - resume(): devices are woken up so that we can write image to swap - - write image to swap - - suspend(PMSG_SUSPEND): suspend devices so that we can power off - - turn the power off - - Resume part - ~~~~~~~~~~~ - (is actually pretty similar) - - running system, user asks for suspend-to-disk - - user processes are stopped (in common case there are none, but with resume-from-initrd, no one knows) - - read image from disk - - suspend(PMSG_FREEZE): devices are frozen so that they don't interfere - with image restoration - - image restoration: rewrite memory with image - - resume(): devices are woken up so that system can continue - - thaw all user processes - -Q: What is this 'Encrypt suspend image' for? - -A: First of all: it is not a replacement for dm-crypt encrypted swap. -It cannot protect your computer while it is suspended. Instead it does -protect from leaking sensitive data after resume from suspend. - -Think of the following: you suspend while an application is running -that keeps sensitive data in memory. The application itself prevents -the data from being swapped out. Suspend, however, must write these -data to swap to be able to resume later on. Without suspend encryption -your sensitive data are then stored in plaintext on disk. This means -that after resume your sensitive data are accessible to all -applications having direct access to the swap device which was used -for suspend. If you don't need swap after resume these data can remain -on disk virtually forever. Thus it can happen that your system gets -broken in weeks later and sensitive data which you thought were -encrypted and protected are retrieved and stolen from the swap device. -To prevent this situation you should use 'Encrypt suspend image'. - -During suspend a temporary key is created and this key is used to -encrypt the data written to disk. When, during resume, the data was -read back into memory the temporary key is destroyed which simply -means that all data written to disk during suspend are then -inaccessible so they can't be stolen later on. The only thing that -you must then take care of is that you call 'mkswap' for the swap -partition used for suspend as early as possible during regular -boot. This asserts that any temporary key from an oopsed suspend or -from a failed or aborted resume is erased from the swap device. - -As a rule of thumb use encrypted swap to protect your data while your -system is shut down or suspended. Additionally use the encrypted -suspend image to prevent sensitive data from being stolen after -resume. - -Q: Can I suspend to a swap file? - -A: Generally, yes, you can. However, it requires you to use the "resume=" and -"resume_offset=" kernel command line parameters, so the resume from a swap file -cannot be initiated from an initrd or initramfs image. See -swsusp-and-swap-files.txt for details. - -Q: Is there a maximum system RAM size that is supported by swsusp? - -A: It should work okay with highmem. - -Q: Does swsusp (to disk) use only one swap partition or can it use -multiple swap partitions (aggregate them into one logical space)? - -A: Only one swap partition, sorry. - -Q: If my application(s) causes lots of memory & swap space to be used -(over half of the total system RAM), is it correct that it is likely -to be useless to try to suspend to disk while that app is running? - -A: No, it should work okay, as long as your app does not mlock() -it. Just prepare big enough swap partition. - -Q: What information is useful for debugging suspend-to-disk problems? - -A: Well, last messages on the screen are always useful. If something -is broken, it is usually some kernel driver, therefore trying with as -little as possible modules loaded helps a lot. I also prefer people to -suspend from console, preferably without X running. Booting with -init=/bin/bash, then swapon and starting suspend sequence manually -usually does the trick. Then it is good idea to try with latest -vanilla kernel. - -Q: How can distributions ship a swsusp-supporting kernel with modular -disk drivers (especially SATA)? - -A: Well, it can be done, load the drivers, then do echo into -/sys/power/resume file from initrd. Be sure not to mount -anything, not even read-only mount, or you are going to lose your -data. - -Q: How do I make suspend more verbose? - -A: If you want to see any non-error kernel messages on the virtual -terminal the kernel switches to during suspend, you have to set the -kernel console loglevel to at least 4 (KERN_WARNING), for example by -doing - - # save the old loglevel - read LOGLEVEL DUMMY < /proc/sys/kernel/printk - # set the loglevel so we see the progress bar. - # if the level is higher than needed, we leave it alone. - if [ $LOGLEVEL -lt 5 ]; then - echo 5 > /proc/sys/kernel/printk - fi - - IMG_SZ=0 - read IMG_SZ < /sys/power/image_size - echo -n disk > /sys/power/state - RET=$? - # - # the logic here is: - # if image_size > 0 (without kernel support, IMG_SZ will be zero), - # then try again with image_size set to zero. - if [ $RET -ne 0 -a $IMG_SZ -ne 0 ]; then # try again with minimal image size - echo 0 > /sys/power/image_size - echo -n disk > /sys/power/state - RET=$? - fi - - # restore previous loglevel - echo $LOGLEVEL > /proc/sys/kernel/printk - exit $RET - -Q: Is this true that if I have a mounted filesystem on a USB device and -I suspend to disk, I can lose data unless the filesystem has been mounted -with "sync"? - -A: That's right ... if you disconnect that device, you may lose data. -In fact, even with "-o sync" you can lose data if your programs have -information in buffers they haven't written out to a disk you disconnect, -or if you disconnect before the device finished saving data you wrote. - -Software suspend normally powers down USB controllers, which is equivalent -to disconnecting all USB devices attached to your system. - -Your system might well support low-power modes for its USB controllers -while the system is asleep, maintaining the connection, using true sleep -modes like "suspend-to-RAM" or "standby". (Don't write "disk" to the -/sys/power/state file; write "standby" or "mem".) We've not seen any -hardware that can use these modes through software suspend, although in -theory some systems might support "platform" modes that won't break the -USB connections. - -Remember that it's always a bad idea to unplug a disk drive containing a -mounted filesystem. That's true even when your system is asleep! The -safest thing is to unmount all filesystems on removable media (such USB, -Firewire, CompactFlash, MMC, external SATA, or even IDE hotplug bays) -before suspending; then remount them after resuming. - -There is a work-around for this problem. For more information, see -Documentation/driver-api/usb/persist.rst. - -Q: Can I suspend-to-disk using a swap partition under LVM? - -A: Yes and No. You can suspend successfully, but the kernel will not be able -to resume on its own. You need an initramfs that can recognize the resume -situation, activate the logical volume containing the swap volume (but not -touch any filesystems!), and eventually call - -echo -n "$major:$minor" > /sys/power/resume - -where $major and $minor are the respective major and minor device numbers of -the swap volume. - -uswsusp works with LVM, too. See http://suspend.sourceforge.net/ - -Q: I upgraded the kernel from 2.6.15 to 2.6.16. Both kernels were -compiled with the similar configuration files. Anyway I found that -suspend to disk (and resume) is much slower on 2.6.16 compared to -2.6.15. Any idea for why that might happen or how can I speed it up? - -A: This is because the size of the suspend image is now greater than -for 2.6.15 (by saving more data we can get more responsive system -after resume). - -There's the /sys/power/image_size knob that controls the size of the -image. If you set it to 0 (eg. by echo 0 > /sys/power/image_size as -root), the 2.6.15 behavior should be restored. If it is still too -slow, take a look at suspend.sf.net -- userland suspend is faster and -supports LZF compression to speed it up further. diff --git a/Documentation/power/tricks.rst b/Documentation/power/tricks.rst new file mode 100644 index 000000000000..ca787f142c3f --- /dev/null +++ b/Documentation/power/tricks.rst @@ -0,0 +1,29 @@ +================ +swsusp/S3 tricks +================ + +Pavel Machek + +If you want to trick swsusp/S3 into working, you might want to try: + +* go with minimal config, turn off drivers like USB, AGP you don't + really need + +* turn off APIC and preempt + +* use ext2. At least it has working fsck. [If something seems to go + wrong, force fsck when you have a chance] + +* turn off modules + +* use vga text console, shut down X. [If you really want X, you might + want to try vesafb later] + +* try running as few processes as possible, preferably go to single + user mode. + +* due to video issues, swsusp should be easier to get working than + S3. Try that first. + +When you make it work, try to find out what exactly was it that broke +suspend, and preferably fix that. diff --git a/Documentation/power/tricks.txt b/Documentation/power/tricks.txt deleted file mode 100644 index a1b8f7249f4c..000000000000 --- a/Documentation/power/tricks.txt +++ /dev/null @@ -1,27 +0,0 @@ - swsusp/S3 tricks - ~~~~~~~~~~~~~~~~ -Pavel Machek - -If you want to trick swsusp/S3 into working, you might want to try: - -* go with minimal config, turn off drivers like USB, AGP you don't - really need - -* turn off APIC and preempt - -* use ext2. At least it has working fsck. [If something seems to go - wrong, force fsck when you have a chance] - -* turn off modules - -* use vga text console, shut down X. [If you really want X, you might - want to try vesafb later] - -* try running as few processes as possible, preferably go to single - user mode. - -* due to video issues, swsusp should be easier to get working than - S3. Try that first. - -When you make it work, try to find out what exactly was it that broke -suspend, and preferably fix that. diff --git a/Documentation/power/userland-swsusp.rst b/Documentation/power/userland-swsusp.rst new file mode 100644 index 000000000000..a0fa51bb1a4d --- /dev/null +++ b/Documentation/power/userland-swsusp.rst @@ -0,0 +1,191 @@ +===================================================== +Documentation for userland software suspend interface +===================================================== + + (C) 2006 Rafael J. Wysocki + +First, the warnings at the beginning of swsusp.txt still apply. + +Second, you should read the FAQ in swsusp.txt _now_ if you have not +done it already. + +Now, to use the userland interface for software suspend you need special +utilities that will read/write the system memory snapshot from/to the +kernel. Such utilities are available, for example, from +. You may want to have a look at them if you +are going to develop your own suspend/resume utilities. + +The interface consists of a character device providing the open(), +release(), read(), and write() operations as well as several ioctl() +commands defined in include/linux/suspend_ioctls.h . The major and minor +numbers of the device are, respectively, 10 and 231, and they can +be read from /sys/class/misc/snapshot/dev. + +The device can be open either for reading or for writing. If open for +reading, it is considered to be in the suspend mode. Otherwise it is +assumed to be in the resume mode. The device cannot be open for simultaneous +reading and writing. It is also impossible to have the device open more than +once at a time. + +Even opening the device has side effects. Data structures are +allocated, and PM_HIBERNATION_PREPARE / PM_RESTORE_PREPARE chains are +called. + +The ioctl() commands recognized by the device are: + +SNAPSHOT_FREEZE + freeze user space processes (the current process is + not frozen); this is required for SNAPSHOT_CREATE_IMAGE + and SNAPSHOT_ATOMIC_RESTORE to succeed + +SNAPSHOT_UNFREEZE + thaw user space processes frozen by SNAPSHOT_FREEZE + +SNAPSHOT_CREATE_IMAGE + create a snapshot of the system memory; the + last argument of ioctl() should be a pointer to an int variable, + the value of which will indicate whether the call returned after + creating the snapshot (1) or after restoring the system memory state + from it (0) (after resume the system finds itself finishing the + SNAPSHOT_CREATE_IMAGE ioctl() again); after the snapshot + has been created the read() operation can be used to transfer + it out of the kernel + +SNAPSHOT_ATOMIC_RESTORE + restore the system memory state from the + uploaded snapshot image; before calling it you should transfer + the system memory snapshot back to the kernel using the write() + operation; this call will not succeed if the snapshot + image is not available to the kernel + +SNAPSHOT_FREE + free memory allocated for the snapshot image + +SNAPSHOT_PREF_IMAGE_SIZE + set the preferred maximum size of the image + (the kernel will do its best to ensure the image size will not exceed + this number, but if it turns out to be impossible, the kernel will + create the smallest image possible) + +SNAPSHOT_GET_IMAGE_SIZE + return the actual size of the hibernation image + +SNAPSHOT_AVAIL_SWAP_SIZE + return the amount of available swap in bytes (the + last argument should be a pointer to an unsigned int variable that will + contain the result if the call is successful). + +SNAPSHOT_ALLOC_SWAP_PAGE + allocate a swap page from the resume partition + (the last argument should be a pointer to a loff_t variable that + will contain the swap page offset if the call is successful) + +SNAPSHOT_FREE_SWAP_PAGES + free all swap pages allocated by + SNAPSHOT_ALLOC_SWAP_PAGE + +SNAPSHOT_SET_SWAP_AREA + set the resume partition and the offset (in + units) from the beginning of the partition at which the swap header is + located (the last ioctl() argument should point to a struct + resume_swap_area, as defined in kernel/power/suspend_ioctls.h, + containing the resume device specification and the offset); for swap + partitions the offset is always 0, but it is different from zero for + swap files (see Documentation/power/swsusp-and-swap-files.rst for + details). + +SNAPSHOT_PLATFORM_SUPPORT + enable/disable the hibernation platform support, + depending on the argument value (enable, if the argument is nonzero) + +SNAPSHOT_POWER_OFF + make the kernel transition the system to the hibernation + state (eg. ACPI S4) using the platform (eg. ACPI) driver + +SNAPSHOT_S2RAM + suspend to RAM; using this call causes the kernel to + immediately enter the suspend-to-RAM state, so this call must always + be preceded by the SNAPSHOT_FREEZE call and it is also necessary + to use the SNAPSHOT_UNFREEZE call after the system wakes up. This call + is needed to implement the suspend-to-both mechanism in which the + suspend image is first created, as though the system had been suspended + to disk, and then the system is suspended to RAM (this makes it possible + to resume the system from RAM if there's enough battery power or restore + its state on the basis of the saved suspend image otherwise) + +The device's read() operation can be used to transfer the snapshot image from +the kernel. It has the following limitations: + +- you cannot read() more than one virtual memory page at a time +- read()s across page boundaries are impossible (ie. if you read() 1/2 of + a page in the previous call, you will only be able to read() + **at most** 1/2 of the page in the next call) + +The device's write() operation is used for uploading the system memory snapshot +into the kernel. It has the same limitations as the read() operation. + +The release() operation frees all memory allocated for the snapshot image +and all swap pages allocated with SNAPSHOT_ALLOC_SWAP_PAGE (if any). +Thus it is not necessary to use either SNAPSHOT_FREE or +SNAPSHOT_FREE_SWAP_PAGES before closing the device (in fact it will also +unfreeze user space processes frozen by SNAPSHOT_UNFREEZE if they are +still frozen when the device is being closed). + +Currently it is assumed that the userland utilities reading/writing the +snapshot image from/to the kernel will use a swap partition, called the resume +partition, or a swap file as storage space (if a swap file is used, the resume +partition is the partition that holds this file). However, this is not really +required, as they can use, for example, a special (blank) suspend partition or +a file on a partition that is unmounted before SNAPSHOT_CREATE_IMAGE and +mounted afterwards. + +These utilities MUST NOT make any assumptions regarding the ordering of +data within the snapshot image. The contents of the image are entirely owned +by the kernel and its structure may be changed in future kernel releases. + +The snapshot image MUST be written to the kernel unaltered (ie. all of the image +data, metadata and header MUST be written in _exactly_ the same amount, form +and order in which they have been read). Otherwise, the behavior of the +resumed system may be totally unpredictable. + +While executing SNAPSHOT_ATOMIC_RESTORE the kernel checks if the +structure of the snapshot image is consistent with the information stored +in the image header. If any inconsistencies are detected, +SNAPSHOT_ATOMIC_RESTORE will not succeed. Still, this is not a fool-proof +mechanism and the userland utilities using the interface SHOULD use additional +means, such as checksums, to ensure the integrity of the snapshot image. + +The suspending and resuming utilities MUST lock themselves in memory, +preferably using mlockall(), before calling SNAPSHOT_FREEZE. + +The suspending utility MUST check the value stored by SNAPSHOT_CREATE_IMAGE +in the memory location pointed to by the last argument of ioctl() and proceed +in accordance with it: + +1. If the value is 1 (ie. the system memory snapshot has just been + created and the system is ready for saving it): + + (a) The suspending utility MUST NOT close the snapshot device + _unless_ the whole suspend procedure is to be cancelled, in + which case, if the snapshot image has already been saved, the + suspending utility SHOULD destroy it, preferably by zapping + its header. If the suspend is not to be cancelled, the + system MUST be powered off or rebooted after the snapshot + image has been saved. + (b) The suspending utility SHOULD NOT attempt to perform any + file system operations (including reads) on the file systems + that were mounted before SNAPSHOT_CREATE_IMAGE has been + called. However, it MAY mount a file system that was not + mounted at that time and perform some operations on it (eg. + use it for saving the image). + +2. If the value is 0 (ie. the system state has just been restored from + the snapshot image), the suspending utility MUST close the snapshot + device. Afterwards it will be treated as a regular userland process, + so it need not exit. + +The resuming utility SHOULD NOT attempt to mount any file systems that could +be mounted before suspend and SHOULD NOT attempt to perform any operations +involving such file systems. + +For details, please refer to the source code. diff --git a/Documentation/power/userland-swsusp.txt b/Documentation/power/userland-swsusp.txt deleted file mode 100644 index bbfcd1bbedc5..000000000000 --- a/Documentation/power/userland-swsusp.txt +++ /dev/null @@ -1,170 +0,0 @@ -Documentation for userland software suspend interface - (C) 2006 Rafael J. Wysocki - -First, the warnings at the beginning of swsusp.txt still apply. - -Second, you should read the FAQ in swsusp.txt _now_ if you have not -done it already. - -Now, to use the userland interface for software suspend you need special -utilities that will read/write the system memory snapshot from/to the -kernel. Such utilities are available, for example, from -. You may want to have a look at them if you -are going to develop your own suspend/resume utilities. - -The interface consists of a character device providing the open(), -release(), read(), and write() operations as well as several ioctl() -commands defined in include/linux/suspend_ioctls.h . The major and minor -numbers of the device are, respectively, 10 and 231, and they can -be read from /sys/class/misc/snapshot/dev. - -The device can be open either for reading or for writing. If open for -reading, it is considered to be in the suspend mode. Otherwise it is -assumed to be in the resume mode. The device cannot be open for simultaneous -reading and writing. It is also impossible to have the device open more than -once at a time. - -Even opening the device has side effects. Data structures are -allocated, and PM_HIBERNATION_PREPARE / PM_RESTORE_PREPARE chains are -called. - -The ioctl() commands recognized by the device are: - -SNAPSHOT_FREEZE - freeze user space processes (the current process is - not frozen); this is required for SNAPSHOT_CREATE_IMAGE - and SNAPSHOT_ATOMIC_RESTORE to succeed - -SNAPSHOT_UNFREEZE - thaw user space processes frozen by SNAPSHOT_FREEZE - -SNAPSHOT_CREATE_IMAGE - create a snapshot of the system memory; the - last argument of ioctl() should be a pointer to an int variable, - the value of which will indicate whether the call returned after - creating the snapshot (1) or after restoring the system memory state - from it (0) (after resume the system finds itself finishing the - SNAPSHOT_CREATE_IMAGE ioctl() again); after the snapshot - has been created the read() operation can be used to transfer - it out of the kernel - -SNAPSHOT_ATOMIC_RESTORE - restore the system memory state from the - uploaded snapshot image; before calling it you should transfer - the system memory snapshot back to the kernel using the write() - operation; this call will not succeed if the snapshot - image is not available to the kernel - -SNAPSHOT_FREE - free memory allocated for the snapshot image - -SNAPSHOT_PREF_IMAGE_SIZE - set the preferred maximum size of the image - (the kernel will do its best to ensure the image size will not exceed - this number, but if it turns out to be impossible, the kernel will - create the smallest image possible) - -SNAPSHOT_GET_IMAGE_SIZE - return the actual size of the hibernation image - -SNAPSHOT_AVAIL_SWAP_SIZE - return the amount of available swap in bytes (the - last argument should be a pointer to an unsigned int variable that will - contain the result if the call is successful). - -SNAPSHOT_ALLOC_SWAP_PAGE - allocate a swap page from the resume partition - (the last argument should be a pointer to a loff_t variable that - will contain the swap page offset if the call is successful) - -SNAPSHOT_FREE_SWAP_PAGES - free all swap pages allocated by - SNAPSHOT_ALLOC_SWAP_PAGE - -SNAPSHOT_SET_SWAP_AREA - set the resume partition and the offset (in - units) from the beginning of the partition at which the swap header is - located (the last ioctl() argument should point to a struct - resume_swap_area, as defined in kernel/power/suspend_ioctls.h, - containing the resume device specification and the offset); for swap - partitions the offset is always 0, but it is different from zero for - swap files (see Documentation/power/swsusp-and-swap-files.txt for - details). - -SNAPSHOT_PLATFORM_SUPPORT - enable/disable the hibernation platform support, - depending on the argument value (enable, if the argument is nonzero) - -SNAPSHOT_POWER_OFF - make the kernel transition the system to the hibernation - state (eg. ACPI S4) using the platform (eg. ACPI) driver - -SNAPSHOT_S2RAM - suspend to RAM; using this call causes the kernel to - immediately enter the suspend-to-RAM state, so this call must always - be preceded by the SNAPSHOT_FREEZE call and it is also necessary - to use the SNAPSHOT_UNFREEZE call after the system wakes up. This call - is needed to implement the suspend-to-both mechanism in which the - suspend image is first created, as though the system had been suspended - to disk, and then the system is suspended to RAM (this makes it possible - to resume the system from RAM if there's enough battery power or restore - its state on the basis of the saved suspend image otherwise) - -The device's read() operation can be used to transfer the snapshot image from -the kernel. It has the following limitations: -- you cannot read() more than one virtual memory page at a time -- read()s across page boundaries are impossible (ie. if you read() 1/2 of - a page in the previous call, you will only be able to read() - _at_ _most_ 1/2 of the page in the next call) - -The device's write() operation is used for uploading the system memory snapshot -into the kernel. It has the same limitations as the read() operation. - -The release() operation frees all memory allocated for the snapshot image -and all swap pages allocated with SNAPSHOT_ALLOC_SWAP_PAGE (if any). -Thus it is not necessary to use either SNAPSHOT_FREE or -SNAPSHOT_FREE_SWAP_PAGES before closing the device (in fact it will also -unfreeze user space processes frozen by SNAPSHOT_UNFREEZE if they are -still frozen when the device is being closed). - -Currently it is assumed that the userland utilities reading/writing the -snapshot image from/to the kernel will use a swap partition, called the resume -partition, or a swap file as storage space (if a swap file is used, the resume -partition is the partition that holds this file). However, this is not really -required, as they can use, for example, a special (blank) suspend partition or -a file on a partition that is unmounted before SNAPSHOT_CREATE_IMAGE and -mounted afterwards. - -These utilities MUST NOT make any assumptions regarding the ordering of -data within the snapshot image. The contents of the image are entirely owned -by the kernel and its structure may be changed in future kernel releases. - -The snapshot image MUST be written to the kernel unaltered (ie. all of the image -data, metadata and header MUST be written in _exactly_ the same amount, form -and order in which they have been read). Otherwise, the behavior of the -resumed system may be totally unpredictable. - -While executing SNAPSHOT_ATOMIC_RESTORE the kernel checks if the -structure of the snapshot image is consistent with the information stored -in the image header. If any inconsistencies are detected, -SNAPSHOT_ATOMIC_RESTORE will not succeed. Still, this is not a fool-proof -mechanism and the userland utilities using the interface SHOULD use additional -means, such as checksums, to ensure the integrity of the snapshot image. - -The suspending and resuming utilities MUST lock themselves in memory, -preferably using mlockall(), before calling SNAPSHOT_FREEZE. - -The suspending utility MUST check the value stored by SNAPSHOT_CREATE_IMAGE -in the memory location pointed to by the last argument of ioctl() and proceed -in accordance with it: -1. If the value is 1 (ie. the system memory snapshot has just been - created and the system is ready for saving it): - (a) The suspending utility MUST NOT close the snapshot device - _unless_ the whole suspend procedure is to be cancelled, in - which case, if the snapshot image has already been saved, the - suspending utility SHOULD destroy it, preferably by zapping - its header. If the suspend is not to be cancelled, the - system MUST be powered off or rebooted after the snapshot - image has been saved. - (b) The suspending utility SHOULD NOT attempt to perform any - file system operations (including reads) on the file systems - that were mounted before SNAPSHOT_CREATE_IMAGE has been - called. However, it MAY mount a file system that was not - mounted at that time and perform some operations on it (eg. - use it for saving the image). -2. If the value is 0 (ie. the system state has just been restored from - the snapshot image), the suspending utility MUST close the snapshot - device. Afterwards it will be treated as a regular userland process, - so it need not exit. - -The resuming utility SHOULD NOT attempt to mount any file systems that could -be mounted before suspend and SHOULD NOT attempt to perform any operations -involving such file systems. - -For details, please refer to the source code. diff --git a/Documentation/power/video.rst b/Documentation/power/video.rst new file mode 100644 index 000000000000..337a2ba9f32f --- /dev/null +++ b/Documentation/power/video.rst @@ -0,0 +1,213 @@ +=========================== +Video issues with S3 resume +=========================== + +2003-2006, Pavel Machek + +During S3 resume, hardware needs to be reinitialized. For most +devices, this is easy, and kernel driver knows how to do +it. Unfortunately there's one exception: video card. Those are usually +initialized by BIOS, and kernel does not have enough information to +boot video card. (Kernel usually does not even contain video card +driver -- vesafb and vgacon are widely used). + +This is not problem for swsusp, because during swsusp resume, BIOS is +run normally so video card is normally initialized. It should not be +problem for S1 standby, because hardware should retain its state over +that. + +We either have to run video BIOS during early resume, or interpret it +using vbetool later, or maybe nothing is necessary on particular +system because video state is preserved. Unfortunately different +methods work on different systems, and no known method suits all of +them. + +Userland application called s2ram has been developed; it contains long +whitelist of systems, and automatically selects working method for a +given system. It can be downloaded from CVS at +www.sf.net/projects/suspend . If you get a system that is not in the +whitelist, please try to find a working solution, and submit whitelist +entry so that work does not need to be repeated. + +Currently, VBE_SAVE method (6 below) works on most +systems. Unfortunately, vbetool only runs after userland is resumed, +so it makes debugging of early resume problems +hard/impossible. Methods that do not rely on userland are preferable. + +Details +~~~~~~~ + +There are a few types of systems where video works after S3 resume: + +(1) systems where video state is preserved over S3. + +(2) systems where it is possible to call the video BIOS during S3 + resume. Unfortunately, it is not correct to call the video BIOS at + that point, but it happens to work on some machines. Use + acpi_sleep=s3_bios. + +(3) systems that initialize video card into vga text mode and where + the BIOS works well enough to be able to set video mode. Use + acpi_sleep=s3_mode on these. + +(4) on some systems s3_bios kicks video into text mode, and + acpi_sleep=s3_bios,s3_mode is needed. + +(5) radeon systems, where X can soft-boot your video card. You'll need + a new enough X, and a plain text console (no vesafb or radeonfb). See + http://www.doesi.gmxhome.de/linux/tm800s3/s3.html for more information. + Alternatively, you should use vbetool (6) instead. + +(6) other radeon systems, where vbetool is enough to bring system back + to life. It needs text console to be working. Do vbetool vbestate + save > /tmp/delme; echo 3 > /proc/acpi/sleep; vbetool post; vbetool + vbestate restore < /tmp/delme; setfont , and your video + should work. + +(7) on some systems, it is possible to boot most of kernel, and then + POSTing bios works. Ole Rohne has patch to do just that at + http://dev.gentoo.org/~marineam/patch-radeonfb-2.6.11-rc2-mm2. + +(8) on some systems, you can use the video_post utility and or + do echo 3 > /sys/power/state && /usr/sbin/video_post - which will + initialize the display in console mode. If you are in X, you can switch + to a virtual terminal and back to X using CTRL+ALT+F1 - CTRL+ALT+F7 to get + the display working in graphical mode again. + +Now, if you pass acpi_sleep=something, and it does not work with your +bios, you'll get a hard crash during resume. Be careful. Also it is +safest to do your experiments with plain old VGA console. The vesafb +and radeonfb (etc) drivers have a tendency to crash the machine during +resume. + +You may have a system where none of above works. At that point you +either invent another ugly hack that works, or write proper driver for +your video card (good luck getting docs :-(). Maybe suspending from X +(proper X, knowing your hardware, not XF68_FBcon) might have better +chance of working. + +Table of known working notebooks: + + +=============================== =============================================== +Model hack (or "how to do it") +=============================== =============================================== +Acer Aspire 1406LC ole's late BIOS init (7), turn off DRI +Acer TM 230 s3_bios (2) +Acer TM 242FX vbetool (6) +Acer TM C110 video_post (8) +Acer TM C300 vga=normal (only suspend on console, not in X), + vbetool (6) or video_post (8) +Acer TM 4052LCi s3_bios (2) +Acer TM 636Lci s3_bios,s3_mode (4) +Acer TM 650 (Radeon M7) vga=normal plus boot-radeon (5) gets text + console back +Acer TM 660 ??? [#f1]_ +Acer TM 800 vga=normal, X patches, see webpage (5) + or vbetool (6) +Acer TM 803 vga=normal, X patches, see webpage (5) + or vbetool (6) +Acer TM 803LCi vga=normal, vbetool (6) +Arima W730a vbetool needed (6) +Asus L2400D s3_mode (3) [#f2]_ (S1 also works OK) +Asus L3350M (SiS 740) (6) +Asus L3800C (Radeon M7) s3_bios (2) (S1 also works OK) +Asus M6887Ne vga=normal, s3_bios (2), use radeon driver + instead of fglrx in x.org +Athlon64 desktop prototype s3_bios (2) +Compal CL-50 ??? [#f1]_ +Compaq Armada E500 - P3-700 none (1) (S1 also works OK) +Compaq Evo N620c vga=normal, s3_bios (2) +Dell 600m, ATI R250 Lf none (1), but needs xorg-x11-6.8.1.902-1 +Dell D600, ATI RV250 vga=normal and X, or try vbestate (6) +Dell D610 vga=normal and X (possibly vbestate (6) too, + but not tested) +Dell Inspiron 4000 ??? [#f1]_ +Dell Inspiron 500m ??? [#f1]_ +Dell Inspiron 510m ??? +Dell Inspiron 5150 vbetool needed (6) +Dell Inspiron 600m ??? [#f1]_ +Dell Inspiron 8200 ??? [#f1]_ +Dell Inspiron 8500 ??? [#f1]_ +Dell Inspiron 8600 ??? [#f1]_ +eMachines athlon64 machines vbetool needed (6) (someone please get + me model #s) +HP NC6000 s3_bios, may not use radeonfb (2); + or vbetool (6) +HP NX7000 ??? [#f1]_ +HP Pavilion ZD7000 vbetool post needed, need open-source nv + driver for X +HP Omnibook XE3 athlon version none (1) +HP Omnibook XE3GC none (1), video is S3 Savage/IX-MV +HP Omnibook XE3L-GF vbetool (6) +HP Omnibook 5150 none (1), (S1 also works OK) +IBM TP T20, model 2647-44G none (1), video is S3 Inc. 86C270-294 + Savage/IX-MV, vesafb gets "interesting" + but X work. +IBM TP A31 / Type 2652-M5G s3_mode (3) [works ok with + BIOS 1.04 2002-08-23, but not at all with + BIOS 1.11 2004-11-05 :-(] +IBM TP R32 / Type 2658-MMG none (1) +IBM TP R40 2722B3G ??? [#f1]_ +IBM TP R50p / Type 1832-22U s3_bios (2) +IBM TP R51 none (1) +IBM TP T30 236681A ??? [#f1]_ +IBM TP T40 / Type 2373-MU4 none (1) +IBM TP T40p none (1) +IBM TP R40p s3_bios (2) +IBM TP T41p s3_bios (2), switch to X after resume +IBM TP T42 s3_bios (2) +IBM ThinkPad T42p (2373-GTG) s3_bios (2) +IBM TP X20 ??? [#f1]_ +IBM TP X30 s3_bios, s3_mode (4) +IBM TP X31 / Type 2672-XXH none (1), use radeontool + (http://fdd.com/software/radeon/) to + turn off backlight. +IBM TP X32 none (1), but backlight is on and video is + trashed after long suspend. s3_bios, + s3_mode (4) works too. Perhaps that gets + better results? +IBM Thinkpad X40 Type 2371-7JG s3_bios,s3_mode (4) +IBM TP 600e none(1), but a switch to console and + back to X is needed +Medion MD4220 ??? [#f1]_ +Samsung P35 vbetool needed (6) +Sharp PC-AR10 (ATI rage) none (1), backlight does not switch off +Sony Vaio PCG-C1VRX/K s3_bios (2) +Sony Vaio PCG-F403 ??? [#f1]_ +Sony Vaio PCG-GRT995MP none (1), works with 'nv' X driver +Sony Vaio PCG-GR7/K none (1), but needs radeonfb, use + radeontool (http://fdd.com/software/radeon/) + to turn off backlight. +Sony Vaio PCG-N505SN ??? [#f1]_ +Sony Vaio vgn-s260 X or boot-radeon can init it (5) +Sony Vaio vgn-S580BH vga=normal, but suspend from X. Console will + be blank unless you return to X. +Sony Vaio vgn-FS115B s3_bios (2),s3_mode (4) +Toshiba Libretto L5 none (1) +Toshiba Libretto 100CT/110CT vbetool (6) +Toshiba Portege 3020CT s3_mode (3) +Toshiba Satellite 4030CDT s3_mode (3) (S1 also works OK) +Toshiba Satellite 4080XCDT s3_mode (3) (S1 also works OK) +Toshiba Satellite 4090XCDT ??? [#f1]_ +Toshiba Satellite P10-554 s3_bios,s3_mode (4)[#f3]_ +Toshiba M30 (2) xor X with nvidia driver using internal AGP +Uniwill 244IIO ??? [#f1]_ +=============================== =============================================== + +Known working desktop systems +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +=================== ============================= ======================== +Mainboard Graphics card hack (or "how to do it") +=================== ============================= ======================== +Asus A7V8X nVidia RIVA TNT2 model 64 s3_bios,s3_mode (4) +=================== ============================= ======================== + + +.. [#f1] from https://wiki.ubuntu.com/HoaryPMResults, not sure + which options to use. If you know, please tell me. + +.. [#f2] To be tested with a newer kernel. + +.. [#f3] Not with SMP kernel, UP only. diff --git a/Documentation/power/video.txt b/Documentation/power/video.txt deleted file mode 100644 index 3e6272bc4472..000000000000 --- a/Documentation/power/video.txt +++ /dev/null @@ -1,185 +0,0 @@ - - Video issues with S3 resume - ~~~~~~~~~~~~~~~~~~~~~~~~~~~ - 2003-2006, Pavel Machek - -During S3 resume, hardware needs to be reinitialized. For most -devices, this is easy, and kernel driver knows how to do -it. Unfortunately there's one exception: video card. Those are usually -initialized by BIOS, and kernel does not have enough information to -boot video card. (Kernel usually does not even contain video card -driver -- vesafb and vgacon are widely used). - -This is not problem for swsusp, because during swsusp resume, BIOS is -run normally so video card is normally initialized. It should not be -problem for S1 standby, because hardware should retain its state over -that. - -We either have to run video BIOS during early resume, or interpret it -using vbetool later, or maybe nothing is necessary on particular -system because video state is preserved. Unfortunately different -methods work on different systems, and no known method suits all of -them. - -Userland application called s2ram has been developed; it contains long -whitelist of systems, and automatically selects working method for a -given system. It can be downloaded from CVS at -www.sf.net/projects/suspend . If you get a system that is not in the -whitelist, please try to find a working solution, and submit whitelist -entry so that work does not need to be repeated. - -Currently, VBE_SAVE method (6 below) works on most -systems. Unfortunately, vbetool only runs after userland is resumed, -so it makes debugging of early resume problems -hard/impossible. Methods that do not rely on userland are preferable. - -Details -~~~~~~~ - -There are a few types of systems where video works after S3 resume: - -(1) systems where video state is preserved over S3. - -(2) systems where it is possible to call the video BIOS during S3 - resume. Unfortunately, it is not correct to call the video BIOS at - that point, but it happens to work on some machines. Use - acpi_sleep=s3_bios. - -(3) systems that initialize video card into vga text mode and where - the BIOS works well enough to be able to set video mode. Use - acpi_sleep=s3_mode on these. - -(4) on some systems s3_bios kicks video into text mode, and - acpi_sleep=s3_bios,s3_mode is needed. - -(5) radeon systems, where X can soft-boot your video card. You'll need - a new enough X, and a plain text console (no vesafb or radeonfb). See - http://www.doesi.gmxhome.de/linux/tm800s3/s3.html for more information. - Alternatively, you should use vbetool (6) instead. - -(6) other radeon systems, where vbetool is enough to bring system back - to life. It needs text console to be working. Do vbetool vbestate - save > /tmp/delme; echo 3 > /proc/acpi/sleep; vbetool post; vbetool - vbestate restore < /tmp/delme; setfont , and your video - should work. - -(7) on some systems, it is possible to boot most of kernel, and then - POSTing bios works. Ole Rohne has patch to do just that at - http://dev.gentoo.org/~marineam/patch-radeonfb-2.6.11-rc2-mm2. - -(8) on some systems, you can use the video_post utility and or - do echo 3 > /sys/power/state && /usr/sbin/video_post - which will - initialize the display in console mode. If you are in X, you can switch - to a virtual terminal and back to X using CTRL+ALT+F1 - CTRL+ALT+F7 to get - the display working in graphical mode again. - -Now, if you pass acpi_sleep=something, and it does not work with your -bios, you'll get a hard crash during resume. Be careful. Also it is -safest to do your experiments with plain old VGA console. The vesafb -and radeonfb (etc) drivers have a tendency to crash the machine during -resume. - -You may have a system where none of above works. At that point you -either invent another ugly hack that works, or write proper driver for -your video card (good luck getting docs :-(). Maybe suspending from X -(proper X, knowing your hardware, not XF68_FBcon) might have better -chance of working. - -Table of known working notebooks: - -Model hack (or "how to do it") ------------------------------------------------------------------------------- -Acer Aspire 1406LC ole's late BIOS init (7), turn off DRI -Acer TM 230 s3_bios (2) -Acer TM 242FX vbetool (6) -Acer TM C110 video_post (8) -Acer TM C300 vga=normal (only suspend on console, not in X), vbetool (6) or video_post (8) -Acer TM 4052LCi s3_bios (2) -Acer TM 636Lci s3_bios,s3_mode (4) -Acer TM 650 (Radeon M7) vga=normal plus boot-radeon (5) gets text console back -Acer TM 660 ??? (*) -Acer TM 800 vga=normal, X patches, see webpage (5) or vbetool (6) -Acer TM 803 vga=normal, X patches, see webpage (5) or vbetool (6) -Acer TM 803LCi vga=normal, vbetool (6) -Arima W730a vbetool needed (6) -Asus L2400D s3_mode (3)(***) (S1 also works OK) -Asus L3350M (SiS 740) (6) -Asus L3800C (Radeon M7) s3_bios (2) (S1 also works OK) -Asus M6887Ne vga=normal, s3_bios (2), use radeon driver instead of fglrx in x.org -Athlon64 desktop prototype s3_bios (2) -Compal CL-50 ??? (*) -Compaq Armada E500 - P3-700 none (1) (S1 also works OK) -Compaq Evo N620c vga=normal, s3_bios (2) -Dell 600m, ATI R250 Lf none (1), but needs xorg-x11-6.8.1.902-1 -Dell D600, ATI RV250 vga=normal and X, or try vbestate (6) -Dell D610 vga=normal and X (possibly vbestate (6) too, but not tested) -Dell Inspiron 4000 ??? (*) -Dell Inspiron 500m ??? (*) -Dell Inspiron 510m ??? -Dell Inspiron 5150 vbetool needed (6) -Dell Inspiron 600m ??? (*) -Dell Inspiron 8200 ??? (*) -Dell Inspiron 8500 ??? (*) -Dell Inspiron 8600 ??? (*) -eMachines athlon64 machines vbetool needed (6) (someone please get me model #s) -HP NC6000 s3_bios, may not use radeonfb (2); or vbetool (6) -HP NX7000 ??? (*) -HP Pavilion ZD7000 vbetool post needed, need open-source nv driver for X -HP Omnibook XE3 athlon version none (1) -HP Omnibook XE3GC none (1), video is S3 Savage/IX-MV -HP Omnibook XE3L-GF vbetool (6) -HP Omnibook 5150 none (1), (S1 also works OK) -IBM TP T20, model 2647-44G none (1), video is S3 Inc. 86C270-294 Savage/IX-MV, vesafb gets "interesting" but X work. -IBM TP A31 / Type 2652-M5G s3_mode (3) [works ok with BIOS 1.04 2002-08-23, but not at all with BIOS 1.11 2004-11-05 :-(] -IBM TP R32 / Type 2658-MMG none (1) -IBM TP R40 2722B3G ??? (*) -IBM TP R50p / Type 1832-22U s3_bios (2) -IBM TP R51 none (1) -IBM TP T30 236681A ??? (*) -IBM TP T40 / Type 2373-MU4 none (1) -IBM TP T40p none (1) -IBM TP R40p s3_bios (2) -IBM TP T41p s3_bios (2), switch to X after resume -IBM TP T42 s3_bios (2) -IBM ThinkPad T42p (2373-GTG) s3_bios (2) -IBM TP X20 ??? (*) -IBM TP X30 s3_bios, s3_mode (4) -IBM TP X31 / Type 2672-XXH none (1), use radeontool (http://fdd.com/software/radeon/) to turn off backlight. -IBM TP X32 none (1), but backlight is on and video is trashed after long suspend. s3_bios,s3_mode (4) works too. Perhaps that gets better results? -IBM Thinkpad X40 Type 2371-7JG s3_bios,s3_mode (4) -IBM TP 600e none(1), but a switch to console and back to X is needed -Medion MD4220 ??? (*) -Samsung P35 vbetool needed (6) -Sharp PC-AR10 (ATI rage) none (1), backlight does not switch off -Sony Vaio PCG-C1VRX/K s3_bios (2) -Sony Vaio PCG-F403 ??? (*) -Sony Vaio PCG-GRT995MP none (1), works with 'nv' X driver -Sony Vaio PCG-GR7/K none (1), but needs radeonfb, use radeontool (http://fdd.com/software/radeon/) to turn off backlight. -Sony Vaio PCG-N505SN ??? (*) -Sony Vaio vgn-s260 X or boot-radeon can init it (5) -Sony Vaio vgn-S580BH vga=normal, but suspend from X. Console will be blank unless you return to X. -Sony Vaio vgn-FS115B s3_bios (2),s3_mode (4) -Toshiba Libretto L5 none (1) -Toshiba Libretto 100CT/110CT vbetool (6) -Toshiba Portege 3020CT s3_mode (3) -Toshiba Satellite 4030CDT s3_mode (3) (S1 also works OK) -Toshiba Satellite 4080XCDT s3_mode (3) (S1 also works OK) -Toshiba Satellite 4090XCDT ??? (*) -Toshiba Satellite P10-554 s3_bios,s3_mode (4)(****) -Toshiba M30 (2) xor X with nvidia driver using internal AGP -Uniwill 244IIO ??? (*) - -Known working desktop systems -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Mainboard Graphics card hack (or "how to do it") ------------------------------------------------------------------------------- -Asus A7V8X nVidia RIVA TNT2 model 64 s3_bios,s3_mode (4) - - -(*) from https://wiki.ubuntu.com/HoaryPMResults, not sure - which options to use. If you know, please tell me. - -(***) To be tested with a newer kernel. - -(****) Not with SMP kernel, UP only. diff --git a/Documentation/process/submitting-drivers.rst b/Documentation/process/submitting-drivers.rst index 58bc047e7b95..1acaa14903d6 100644 --- a/Documentation/process/submitting-drivers.rst +++ b/Documentation/process/submitting-drivers.rst @@ -117,7 +117,7 @@ PM support: implemented") error. You should also try to make sure that your driver uses as little power as possible when it's not doing anything. For the driver testing instructions see - Documentation/power/drivers-testing.txt and for a relatively + Documentation/power/drivers-testing.rst and for a relatively complete overview of the power management issues related to drivers see :ref:`Documentation/driver-api/pm/devices.rst `. diff --git a/Documentation/scheduler/sched-energy.txt b/Documentation/scheduler/sched-energy.txt index 197d81f4b836..d97207b9accb 100644 --- a/Documentation/scheduler/sched-energy.txt +++ b/Documentation/scheduler/sched-energy.txt @@ -22,7 +22,7 @@ the highest. The actual EM used by EAS is _not_ maintained by the scheduler, but by a dedicated framework. For details about this framework and what it provides, -please refer to its documentation (see Documentation/power/energy-model.txt). +please refer to its documentation (see Documentation/power/energy-model.rst). 2. Background and Terminology @@ -81,7 +81,7 @@ through the arch_scale_cpu_capacity() callback. The rest of platform knowledge used by EAS is directly read from the Energy Model (EM) framework. The EM of a platform is composed of a power cost table -per 'performance domain' in the system (see Documentation/power/energy-model.txt +per 'performance domain' in the system (see Documentation/power/energy-model.rst for futher details about performance domains). The scheduler manages references to the EM objects in the topology code when the @@ -352,7 +352,7 @@ could be amended in the future if proven otherwise. EAS uses the EM of a platform to estimate the impact of scheduling decisions on energy. So, your platform must provide power cost tables to the EM framework in order to make EAS start. To do so, please refer to documentation of the -independent EM framework in Documentation/power/energy-model.txt. +independent EM framework in Documentation/power/energy-model.rst. Please also note that the scheduling domains need to be re-built after the EM has been registered in order to start EAS. diff --git a/Documentation/trace/coresight-cpu-debug.txt b/Documentation/trace/coresight-cpu-debug.txt index f07e38094b40..1a660a39e3c0 100644 --- a/Documentation/trace/coresight-cpu-debug.txt +++ b/Documentation/trace/coresight-cpu-debug.txt @@ -151,7 +151,7 @@ At the runtime you can disable idle states with below methods: It is possible to disable CPU idle states by way of the PM QoS subsystem, more specifically by using the "/dev/cpu_dma_latency" -interface (see Documentation/power/pm_qos_interface.txt for more +interface (see Documentation/power/pm_qos_interface.rst for more details). As specified in the PM QoS documentation the requested parameter will stay in effect until the file descriptor is released. For example: diff --git a/Documentation/translations/zh_CN/process/submitting-drivers.rst b/Documentation/translations/zh_CN/process/submitting-drivers.rst index 72c6cd935821..f1c3906c69a8 100644 --- a/Documentation/translations/zh_CN/process/submitting-drivers.rst +++ b/Documentation/translations/zh_CN/process/submitting-drivers.rst @@ -97,7 +97,7 @@ Linux 2.6: 函数定义成返回 -ENOSYS(功能未实现)错误。你还应该尝试确 保你的驱动在什么都不干的情况下将耗电降到最低。要获得驱动 程序测试的指导,请参阅 - Documentation/power/drivers-testing.txt。有关驱动程序电 + Documentation/power/drivers-testing.rst。有关驱动程序电 源管理问题相对全面的概述,请参阅 Documentation/driver-api/pm/devices.rst。 diff --git a/MAINTAINERS b/MAINTAINERS index 9c382053ce6a..5a6137df3f0e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6446,7 +6446,7 @@ M: "Rafael J. Wysocki" M: Pavel Machek L: linux-pm@vger.kernel.org S: Supported -F: Documentation/power/freezing-of-tasks.txt +F: Documentation/power/freezing-of-tasks.rst F: include/linux/freezer.h F: kernel/freezer.c @@ -11764,7 +11764,7 @@ S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/vireshk/pm.git F: drivers/opp/ F: include/linux/pm_opp.h -F: Documentation/power/opp.txt +F: Documentation/power/opp.rst F: Documentation/devicetree/bindings/opp/ OPL4 DRIVER diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2bbbd4d1ba31..77a724771dbb 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2447,7 +2447,7 @@ menuconfig APM machines with more than one CPU. In order to use APM, you will need supporting software. For location - and more information, read + and more information, read and the Battery Powered Linux mini-HOWTO, available from . diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 066fd2a12851..10d040e2e807 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1175,7 +1175,7 @@ struct skl_wm_params { * to be disabled. This shouldn't happen and we'll print some error messages in * case it happens. * - * For more, read the Documentation/power/runtime_pm.txt. + * For more, read the Documentation/power/runtime_pm.rst. */ struct i915_runtime_pm { atomic_t wakeref_count; diff --git a/drivers/opp/Kconfig b/drivers/opp/Kconfig index a7fbb93f302c..1f64a3d46c8a 100644 --- a/drivers/opp/Kconfig +++ b/drivers/opp/Kconfig @@ -10,4 +10,4 @@ config PM_OPP OPP layer organizes the data internally using device pointers representing individual voltage domains and provides SOC implementations a ready to use framework to manage OPPs. - For more information, read + For more information, read diff --git a/drivers/power/supply/power_supply_core.c b/drivers/power/supply/power_supply_core.c index f7033ecf6d0b..11f9c875b028 100644 --- a/drivers/power/supply/power_supply_core.c +++ b/drivers/power/supply/power_supply_core.c @@ -607,7 +607,7 @@ int power_supply_get_battery_info(struct power_supply *psy, /* The property and field names below must correspond to elements * in enum power_supply_property. For reasoning, see - * Documentation/power/power_supply_class.txt. + * Documentation/power/power_supply_class.rst. */ of_property_read_u32(battery_np, "energy-full-design-microwatt-hours", diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index c7eef32e7739..5b8328a99b2a 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -52,7 +52,7 @@ * irq line disabled until the threaded handler has been run. * IRQF_NO_SUSPEND - Do not disable this IRQ during suspend. Does not guarantee * that this interrupt will wake the system from a suspended - * state. See Documentation/power/suspend-and-interrupts.txt + * state. See Documentation/power/suspend-and-interrupts.rst * IRQF_FORCE_RESUME - Force enable it on resume even if IRQF_NO_SUSPEND is set * IRQF_NO_THREAD - Interrupt cannot be threaded * IRQF_EARLY_RESUME - Resume IRQ early during syscore instead of at device diff --git a/include/linux/pci.h b/include/linux/pci.h index b74b2a4e6df2..3d9a167ca5c3 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -807,7 +807,7 @@ struct module; * @suspend_late: Put device into low power state. * @resume_early: Wake device from low power state. * @resume: Wake device from low power state. - * (Please see Documentation/power/pci.txt for descriptions + * (Please see Documentation/power/pci.rst for descriptions * of PCI Power Management and the related functions.) * @shutdown: Hook into reboot_notifier_list (kernel/sys.c). * Intended to stop any idling DMA operations. diff --git a/include/linux/pm.h b/include/linux/pm.h index 66c19a65a514..c14ad8bc1a41 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -284,7 +284,7 @@ typedef struct pm_message { * actions to be performed by a device driver's callbacks generally depend on * the platform and subsystem the device belongs to. * - * Refer to Documentation/power/runtime_pm.txt for more information about the + * Refer to Documentation/power/runtime_pm.rst for more information about the * role of the @runtime_suspend(), @runtime_resume() and @runtime_idle() * callbacks in device runtime power management. */ diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 9bbaaab14b36..7a4dda9e5309 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -65,7 +65,7 @@ config HIBERNATION need to run mkswap against the swap partition used for the suspend. It also works with swap files to a limited extent (for details see - ). + ). Right now you may boot without resuming and resume later but in the meantime you cannot use the swap partition(s)/file(s) involved in @@ -74,7 +74,7 @@ config HIBERNATION MOUNT any journaled filesystems mounted before the suspend or they will get corrupted in a nasty way. - For more information take a look at . + For more information take a look at . config ARCH_SAVE_PAGE_KEYS bool @@ -255,7 +255,7 @@ config APM_EMULATION notification of APM "events" (e.g. battery status change). In order to use APM, you will need supporting software. For location - and more information, read + and more information, read and the Battery Powered Linux mini-HOWTO, available from . diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index 41722046b937..0cd26289bfbc 100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -165,7 +165,7 @@ config CFG80211_DEFAULT_PS If this causes your applications to misbehave you should fix your applications instead -- they need to register their network - latency requirement, see Documentation/power/pm_qos_interface.txt. + latency requirement, see Documentation/power/pm_qos_interface.rst. config CFG80211_DEBUGFS bool "cfg80211 DebugFS entries" -- cgit v1.2.3-55-g7522 From 514caf23a70fd697fa2ece238b2cd8dcc73fb16f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2019 14:27:13 +0200 Subject: memremap: replace the altmap_valid field with a PGMAP_ALTMAP_VALID flag Add a flags field to struct dev_pagemap to replace the altmap_valid boolean to be a little more extensible. Also add a pgmap_altmap() helper to find the optional altmap and clean up the code using the altmap using it. Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Reviewed-by: Dan Williams Tested-by: Dan Williams Signed-off-by: Jason Gunthorpe --- arch/powerpc/mm/mem.c | 10 +--------- arch/x86/mm/init_64.c | 8 ++------ drivers/nvdimm/pfn_devs.c | 3 +-- drivers/nvdimm/pmem.c | 1 - include/linux/memremap.h | 12 +++++++++++- kernel/memremap.c | 26 ++++++++++---------------- mm/hmm.c | 1 - mm/memory_hotplug.c | 6 ++---- mm/page_alloc.c | 5 ++--- 9 files changed, 29 insertions(+), 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 2540d3b2588c..a2923c5c1982 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -131,17 +131,9 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size, { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - struct page *page; + struct page *page = pfn_to_page(start_pfn) + vmem_altmap_offset(altmap); int ret; - /* - * If we have an altmap then we need to skip over any reserved PFNs - * when querying the zone. - */ - page = pfn_to_page(start_pfn); - if (altmap) - page += vmem_altmap_offset(altmap); - __remove_pages(page_zone(page), start_pfn, nr_pages, altmap); /* Remove htab bolted mappings for this section of memory */ diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 0f01c7b1d217..08bbf648827b 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1213,13 +1213,9 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size, { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - struct page *page = pfn_to_page(start_pfn); - struct zone *zone; + struct page *page = pfn_to_page(start_pfn) + vmem_altmap_offset(altmap); + struct zone *zone = page_zone(page); - /* With altmap the first mapped page is offset from @start */ - if (altmap) - page += vmem_altmap_offset(altmap); - zone = page_zone(page); __remove_pages(zone, start_pfn, nr_pages, altmap); kernel_physical_mapping_remove(start, start + size); } diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 0f81fc56bbfd..55fb6b7433ed 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -622,7 +622,6 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap) if (offset < reserve) return -EINVAL; nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns); - pgmap->altmap_valid = false; } else if (nd_pfn->mode == PFN_MODE_PMEM) { nd_pfn->npfns = PFN_SECTION_ALIGN_UP((resource_size(res) - offset) / PAGE_SIZE); @@ -634,7 +633,7 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap) memcpy(altmap, &__altmap, sizeof(*altmap)); altmap->free = PHYS_PFN(offset - reserve); altmap->alloc = 0; - pgmap->altmap_valid = true; + pgmap->flags |= PGMAP_ALTMAP_VALID; } else return -ENXIO; diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 093408ce40ad..e7d8cc9f41e8 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -412,7 +412,6 @@ static int pmem_attach_disk(struct device *dev, bb_res.start += pmem->data_offset; } else if (pmem_should_map_pages(dev)) { memcpy(&pmem->pgmap.res, &nsio->res, sizeof(pmem->pgmap.res)); - pmem->pgmap.altmap_valid = false; pmem->pgmap.type = MEMORY_DEVICE_FS_DAX; pmem->pgmap.ops = &fsdax_pagemap_ops; addr = devm_memremap_pages(dev, &pmem->pgmap); diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 336eca601dad..e25685b878e9 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -88,6 +88,8 @@ struct dev_pagemap_ops { vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf); }; +#define PGMAP_ALTMAP_VALID (1 << 0) + /** * struct dev_pagemap - metadata for ZONE_DEVICE mappings * @altmap: pre-allocated/reserved memory for vmemmap allocations @@ -96,19 +98,27 @@ struct dev_pagemap_ops { * @dev: host device of the mapping for debug * @data: private data pointer for page_free() * @type: memory type: see MEMORY_* in memory_hotplug.h + * @flags: PGMAP_* flags to specify defailed behavior * @ops: method table */ struct dev_pagemap { struct vmem_altmap altmap; - bool altmap_valid; struct resource res; struct percpu_ref *ref; struct device *dev; enum memory_type type; + unsigned int flags; u64 pci_p2pdma_bus_offset; const struct dev_pagemap_ops *ops; }; +static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap) +{ + if (pgmap->flags & PGMAP_ALTMAP_VALID) + return &pgmap->altmap; + return NULL; +} + #ifdef CONFIG_ZONE_DEVICE void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap); diff --git a/kernel/memremap.c b/kernel/memremap.c index 6c3dbb692037..eee490e7d7e1 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -54,14 +54,8 @@ static void pgmap_array_delete(struct resource *res) static unsigned long pfn_first(struct dev_pagemap *pgmap) { - const struct resource *res = &pgmap->res; - struct vmem_altmap *altmap = &pgmap->altmap; - unsigned long pfn; - - pfn = res->start >> PAGE_SHIFT; - if (pgmap->altmap_valid) - pfn += vmem_altmap_offset(altmap); - return pfn; + return (pgmap->res.start >> PAGE_SHIFT) + + vmem_altmap_offset(pgmap_altmap(pgmap)); } static unsigned long pfn_end(struct dev_pagemap *pgmap) @@ -109,7 +103,7 @@ static void devm_memremap_pages_release(void *data) align_size >> PAGE_SHIFT, NULL); } else { arch_remove_memory(nid, align_start, align_size, - pgmap->altmap_valid ? &pgmap->altmap : NULL); + pgmap_altmap(pgmap)); kasan_remove_zero_shadow(__va(align_start), align_size); } mem_hotplug_done(); @@ -129,8 +123,8 @@ static void devm_memremap_pages_release(void *data) * 1/ At a minimum the res, ref and type and ops members of @pgmap must be * initialized by the caller before passing it to this function * - * 2/ The altmap field may optionally be initialized, in which case altmap_valid - * must be set to true + * 2/ The altmap field may optionally be initialized, in which case + * PGMAP_ALTMAP_VALID must be set in pgmap->flags. * * 3/ pgmap->ref must be 'live' on entry and will be killed and reaped * at devm_memremap_pages_release() time, or if this routine fails. @@ -142,15 +136,13 @@ static void devm_memremap_pages_release(void *data) void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) { resource_size_t align_start, align_size, align_end; - struct vmem_altmap *altmap = pgmap->altmap_valid ? - &pgmap->altmap : NULL; struct resource *res = &pgmap->res; struct dev_pagemap *conflict_pgmap; struct mhp_restrictions restrictions = { /* * We do not want any optional features only our own memmap */ - .altmap = altmap, + .altmap = pgmap_altmap(pgmap), }; pgprot_t pgprot = PAGE_KERNEL; int error, nid, is_ram; @@ -274,7 +266,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE]; move_pfn_range_to_zone(zone, align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT, altmap); + align_size >> PAGE_SHIFT, pgmap_altmap(pgmap)); } mem_hotplug_done(); @@ -319,7 +311,9 @@ EXPORT_SYMBOL_GPL(devm_memunmap_pages); unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) { /* number of pfns from base where pfn_to_page() is valid */ - return altmap->reserve + altmap->free; + if (altmap) + return altmap->reserve + altmap->free; + return 0; } void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) diff --git a/mm/hmm.c b/mm/hmm.c index 36e25cdbdac1..e4470462298f 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1442,7 +1442,6 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; devmem->pagemap.res = *devmem->resource; devmem->pagemap.ops = &hmm_pagemap_ops; - devmem->pagemap.altmap_valid = false; devmem->pagemap.ref = &devmem->ref; result = devm_memremap_pages(devmem->device, &devmem->pagemap); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e096c987d261..6166ba5a15f3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -557,10 +557,8 @@ void __remove_pages(struct zone *zone, unsigned long phys_start_pfn, int sections_to_remove; /* In the ZONE_DEVICE case device driver owns the memory region */ - if (is_dev_zone(zone)) { - if (altmap) - map_offset = vmem_altmap_offset(altmap); - } + if (is_dev_zone(zone)) + map_offset = vmem_altmap_offset(altmap); clear_zone_contiguous(zone); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d66bc8abe0af..17a39d40a556 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5853,6 +5853,7 @@ void __ref memmap_init_zone_device(struct zone *zone, { unsigned long pfn, end_pfn = start_pfn + size; struct pglist_data *pgdat = zone->zone_pgdat; + struct vmem_altmap *altmap = pgmap_altmap(pgmap); unsigned long zone_idx = zone_idx(zone); unsigned long start = jiffies; int nid = pgdat->node_id; @@ -5865,9 +5866,7 @@ void __ref memmap_init_zone_device(struct zone *zone, * of the pages reserved for the memmap, so we can just jump to * the end of that region and start processing the device pages. */ - if (pgmap->altmap_valid) { - struct vmem_altmap *altmap = &pgmap->altmap; - + if (altmap) { start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); size = end_pfn - start_pfn; } -- cgit v1.2.3-55-g7522 From 3876d4a38ae22bb56311abf5ea418eac46090c00 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Thu, 27 Jun 2019 15:00:11 -0700 Subject: x86, arm64: Move ARCH_WANT_HUGE_PMD_SHARE config in arch/Kconfig ARCH_WANT_HUGE_PMD_SHARE config was declared in both architectures: move this declaration in arch/Kconfig and make those architectures select it. Signed-off-by: Alexandre Ghiti Reviewed-by: Palmer Dabbelt Acked-by: Ingo Molnar Acked-by: Catalin Marinas # for arm64 Reviewed-by: Hanjun Guo Reviewed-by: Christoph Hellwig Signed-off-by: Paul Walmsley --- arch/Kconfig | 3 +++ arch/arm64/Kconfig | 2 +- arch/x86/Kconfig | 4 +--- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/Kconfig b/arch/Kconfig index c47b328eada0..d2f212dc8e72 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -577,6 +577,9 @@ config HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD config HAVE_ARCH_HUGE_VMAP bool +config ARCH_WANT_HUGE_PMD_SHARE + bool + config HAVE_ARCH_SOFT_DIRTY bool diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 697ea0510729..56c6d5f30e2b 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -71,6 +71,7 @@ config ARM64 select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT select ARCH_WANT_FRAME_POINTERS + select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36) select ARCH_HAS_UBSAN_SANITIZE_ALL select ARM_AMBA select ARM_ARCH_TIMER @@ -902,7 +903,6 @@ config SYS_SUPPORTS_HUGETLBFS def_bool y config ARCH_WANT_HUGE_PMD_SHARE - def_bool y if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36) config ARCH_HAS_CACHE_LINE_SIZE def_bool y diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2bbbd4d1ba31..fa021ec38803 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -93,6 +93,7 @@ config X86 select ARCH_USE_QUEUED_SPINLOCKS select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH select ARCH_WANTS_DYNAMIC_TASK_STRUCT + select ARCH_WANT_HUGE_PMD_SHARE select ARCH_WANTS_THP_SWAP if X86_64 select BUILDTIME_EXTABLE_SORT select CLKEVT_I8253 @@ -301,9 +302,6 @@ config ARCH_HIBERNATION_POSSIBLE config ARCH_SUSPEND_POSSIBLE def_bool y -config ARCH_WANT_HUGE_PMD_SHARE - def_bool y - config ARCH_WANT_GENERAL_HUGETLB def_bool y -- cgit v1.2.3-55-g7522 From e9a1379f9219be439f47a0f063431a92dc529eda Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 12 Jul 2019 19:15:55 +0900 Subject: x86/vdso: Fix flip/flop vdso build bug Two consecutive "make" on an already compiled kernel tree will show different behavior: $ make CALL scripts/checksyscalls.sh CALL scripts/atomic/check-atomics.sh DESCEND objtool CHK include/generated/compile.h VDSOCHK arch/x86/entry/vdso/vdso64.so.dbg VDSOCHK arch/x86/entry/vdso/vdso32.so.dbg Kernel: arch/x86/boot/bzImage is ready (#3) Building modules, stage 2. MODPOST 12 modules $ make make CALL scripts/checksyscalls.sh CALL scripts/atomic/check-atomics.sh DESCEND objtool CHK include/generated/compile.h VDSO arch/x86/entry/vdso/vdso64.so.dbg OBJCOPY arch/x86/entry/vdso/vdso64.so VDSO2C arch/x86/entry/vdso/vdso-image-64.c CC arch/x86/entry/vdso/vdso-image-64.o VDSO arch/x86/entry/vdso/vdso32.so.dbg OBJCOPY arch/x86/entry/vdso/vdso32.so VDSO2C arch/x86/entry/vdso/vdso-image-32.c CC arch/x86/entry/vdso/vdso-image-32.o AR arch/x86/entry/vdso/built-in.a AR arch/x86/entry/built-in.a AR arch/x86/built-in.a GEN .version CHK include/generated/compile.h UPD include/generated/compile.h CC init/version.o AR init/built-in.a LD vmlinux.o This is causing "LD vmlinux" once every two times even without any modifications. This is the same bug fixed in commit 92a4728608a8 ("x86/boot: Fix if_changed build flip/flop bug"). Two "if_changed" cannot be used in one target. Fix this merging two commands into one function. Fixes: 7ac870747988 ("x86/vdso: Switch to generic vDSO implementation") Signed-off-by: Naohiro Aota Signed-off-by: Thomas Gleixner Tested-by: Vincenzo Frascino Reviewed-by: Vincenzo Frascino Reviewed-by: Masahiro Yamada Link: https://lkml.kernel.org/r/20190712101556.17833-1-naohiro.aota@wdc.com --- arch/x86/entry/vdso/Makefile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 39106111be86..34773395139a 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -56,8 +56,7 @@ VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -soname linux-vdso.so.1 --no-undefined \ -z max-page-size=4096 $(obj)/vdso64.so.dbg: $(obj)/vdso.lds $(vobjs) FORCE - $(call if_changed,vdso) - $(call if_changed,vdso_check) + $(call if_changed,vdso_and_check) HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi -I$(srctree)/arch/$(SUBARCH)/include/uapi hostprogs-y += vdso2c @@ -127,8 +126,7 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE $(call if_changed,objcopy) $(obj)/vdsox32.so.dbg: $(obj)/vdsox32.lds $(vobjx32s) FORCE - $(call if_changed,vdso) - $(call if_changed,vdso_check) + $(call if_changed,vdso_and_check) CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds) VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -soname linux-gate.so.1 @@ -167,8 +165,7 @@ $(obj)/vdso32.so.dbg: FORCE \ $(obj)/vdso32/note.o \ $(obj)/vdso32/system_call.o \ $(obj)/vdso32/sigreturn.o - $(call if_changed,vdso) - $(call if_changed,vdso_check) + $(call if_changed,vdso_and_check) # # The DSO images are built using a special linker script. @@ -184,6 +181,9 @@ VDSO_LDFLAGS = -shared $(call ld-option, --hash-style=both) \ -Bsymbolic GCOV_PROFILE := n +quiet_cmd_vdso_and_check = VDSO $@ + cmd_vdso_and_check = $(cmd_vdso); $(cmd_vdso_check) + # # Install the unstripped copies of vdso*.so. If our toolchain supports # build-id, install .build-id links as well. -- cgit v1.2.3-55-g7522 From 16f4641166b10e199f0d7b68c2c5f004fef0bda3 Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Fri, 28 Jun 2019 21:59:20 +0000 Subject: perf/x86/amd/uncore: Do not set 'ThreadMask' and 'SliceMask' for non-L3 PMCs The following commit: d7cbbe49a930 ("perf/x86/amd/uncore: Set ThreadMask and SliceMask for L3 Cache perf events") enables L3 PMC events for all threads and slices by writing 1's in 'ChL3PmcCfg' (L3 PMC PERF_CTL) register fields. Those bitfields overlap with high order event select bits in the Data Fabric PMC control register, however. So when a user requests raw Data Fabric events (-e amd_df/event=0xYYY/), the two highest order bits get inadvertently set, changing the counter select to events that don't exist, and for which no counts are read. This patch changes the logic to write the L3 masks only when dealing with L3 PMC counters. AMD Family 16h and below Northbridge (NB) counters were not affected. Signed-off-by: Kim Phillips Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Gary Hook Cc: H. Peter Anvin Cc: Janakarajan Natarajan Cc: Jiri Olsa Cc: Linus Torvalds Cc: Martin Liska Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Pu Wen Cc: Stephane Eranian Cc: Suravee Suthikulpanit Cc: Thomas Gleixner Cc: Vince Weaver Fixes: d7cbbe49a930 ("perf/x86/amd/uncore: Set ThreadMask and SliceMask for L3 Cache perf events") Link: https://lkml.kernel.org/r/20190628215906.4276-1-kim.phillips@amd.com Signed-off-by: Ingo Molnar --- arch/x86/events/amd/uncore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index 85e6984c560b..c2c4ae5fbbfc 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -206,7 +206,7 @@ static int amd_uncore_event_init(struct perf_event *event) * SliceMask and ThreadMask need to be set for certain L3 events in * Family 17h. For other events, the two fields do not affect the count. */ - if (l3_mask) + if (l3_mask && is_llc_event(event)) hwc->config |= (AMD64_L3_SLICE_MASK | AMD64_L3_THREAD_MASK); if (event->cpu < 0) -- cgit v1.2.3-55-g7522 From 2f217d58a8a086d3399fecce39fb358848e799c4 Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Fri, 28 Jun 2019 21:59:33 +0000 Subject: perf/x86/amd/uncore: Set the thread mask for F17h L3 PMCs Fill in the L3 performance event select register ThreadMask bitfield, to enable per hardware thread accounting. Signed-off-by: Kim Phillips Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Gary Hook Cc: H. Peter Anvin Cc: Janakarajan Natarajan Cc: Jiri Olsa Cc: Linus Torvalds Cc: Martin Liska Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Pu Wen Cc: Stephane Eranian Cc: Suravee Suthikulpanit Cc: Thomas Gleixner Cc: Vince Weaver Link: https://lkml.kernel.org/r/20190628215906.4276-2-kim.phillips@amd.com Signed-off-by: Ingo Molnar --- arch/x86/events/amd/uncore.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index c2c4ae5fbbfc..a6ea07f2aa84 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -202,15 +202,22 @@ static int amd_uncore_event_init(struct perf_event *event) hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB; hwc->idx = -1; + if (event->cpu < 0) + return -EINVAL; + /* * SliceMask and ThreadMask need to be set for certain L3 events in * Family 17h. For other events, the two fields do not affect the count. */ - if (l3_mask && is_llc_event(event)) - hwc->config |= (AMD64_L3_SLICE_MASK | AMD64_L3_THREAD_MASK); + if (l3_mask && is_llc_event(event)) { + int thread = 2 * (cpu_data(event->cpu).cpu_core_id % 4); - if (event->cpu < 0) - return -EINVAL; + if (smp_num_siblings > 1) + thread += cpu_data(event->cpu).apicid & 1; + + hwc->config |= (1ULL << (AMD64_L3_THREAD_SHIFT + thread) & + AMD64_L3_THREAD_MASK) | AMD64_L3_SLICE_MASK; + } uncore = event_to_amd_uncore(event); if (!uncore) -- cgit v1.2.3-55-g7522 From e4557c1a46b0d32746bd309e1941914b5a6912b4 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 25 Jun 2019 07:21:35 -0700 Subject: perf/x86/intel: Fix spurious NMI on fixed counter If a user first sample a PEBS event on a fixed counter, then sample a non-PEBS event on the same fixed counter on Icelake, it will trigger spurious NMI. For example: perf record -e 'cycles:p' -a perf record -e 'cycles' -a The error message for spurious NMI: [June 21 15:38] Uhhuh. NMI received for unknown reason 30 on CPU 2. [ +0.000000] Do you have a strange power saving mode enabled? [ +0.000000] Dazed and confused, but trying to continue The bug was introduced by the following commit: commit 6f55967ad9d9 ("perf/x86/intel: Fix race in intel_pmu_disable_event()") The commit moves the intel_pmu_pebs_disable() after intel_pmu_disable_fixed(), which returns immediately. The related bit of PEBS_ENABLE MSR will never be cleared for the fixed counter. Then a non-PEBS event runs on the fixed counter, but the bit on PEBS_ENABLE is still set, which triggers spurious NMIs. Check and disable PEBS for fixed counters after intel_pmu_disable_fixed(). Reported-by: Yi, Ammy Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Acked-by: Jiri Olsa Cc: Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Fixes: 6f55967ad9d9 ("perf/x86/intel: Fix race in intel_pmu_disable_event()") Link: https://lkml.kernel.org/r/20190625142135.22112-1-kan.liang@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index bda450ff51ee..9e911a96972b 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2161,12 +2161,10 @@ static void intel_pmu_disable_event(struct perf_event *event) cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx); cpuc->intel_cp_status &= ~(1ull << hwc->idx); - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) intel_pmu_disable_fixed(hwc); - return; - } - - x86_pmu_disable_event(event); + else + x86_pmu_disable_event(event); /* * Needs to be called after x86_pmu_disable_event, -- cgit v1.2.3-55-g7522 From 028b6e8a89de9133a869bb4cd1bc72445b1ec8ca Mon Sep 17 00:00:00 2001 From: Dmitry V. Levin Date: Sun, 14 Jul 2019 19:20:47 +0300 Subject: clone: fix CLONE_PIDFD support The introduction of clone3 syscall accidentally broke CLONE_PIDFD support in traditional clone syscall on compat x86 and those architectures that use do_fork to implement clone syscall. This bug was found by strace test suite. Link: https://strace.io/logs/strace/2019-07-12 Fixes: 7f192e3cd316 ("fork: add clone3") Bisected-and-tested-by: Anatoly Pugachev Signed-off-by: Dmitry V. Levin Link: https://lore.kernel.org/r/20190714162047.GB10389@altlinux.org Signed-off-by: Christian Brauner --- arch/x86/ia32/sys_ia32.c | 4 ++++ include/linux/sched/task.h | 1 + kernel/fork.c | 17 +++++++++++++++-- 3 files changed, 20 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 64a6c952091e..21790307121e 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -239,6 +239,7 @@ COMPAT_SYSCALL_DEFINE5(x86_clone, unsigned long, clone_flags, { struct kernel_clone_args args = { .flags = (clone_flags & ~CSIGNAL), + .pidfd = parent_tidptr, .child_tid = child_tidptr, .parent_tid = parent_tidptr, .exit_signal = (clone_flags & CSIGNAL), @@ -246,5 +247,8 @@ COMPAT_SYSCALL_DEFINE5(x86_clone, unsigned long, clone_flags, .tls = tls_val, }; + if (!legacy_clone_args_valid(&args)) + return -EINVAL; + return _do_fork(&args); } diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 109a0df5af39..0497091e40c1 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -89,6 +89,7 @@ extern void exit_files(struct task_struct *); extern void exit_itimers(struct signal_struct *); extern long _do_fork(struct kernel_clone_args *kargs); +extern bool legacy_clone_args_valid(const struct kernel_clone_args *kargs); extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); struct task_struct *fork_idle(int); struct mm_struct *copy_init_mm(void); diff --git a/kernel/fork.c b/kernel/fork.c index 8f3e2d97d771..ef1e05a68827 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2406,6 +2406,16 @@ long _do_fork(struct kernel_clone_args *args) return nr; } +bool legacy_clone_args_valid(const struct kernel_clone_args *kargs) +{ + /* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */ + if ((kargs->flags & CLONE_PIDFD) && + (kargs->flags & CLONE_PARENT_SETTID)) + return false; + + return true; +} + #ifndef CONFIG_HAVE_COPY_THREAD_TLS /* For compatibility with architectures that call do_fork directly rather than * using the syscall entry points below. */ @@ -2417,6 +2427,7 @@ long do_fork(unsigned long clone_flags, { struct kernel_clone_args args = { .flags = (clone_flags & ~CSIGNAL), + .pidfd = parent_tidptr, .child_tid = child_tidptr, .parent_tid = parent_tidptr, .exit_signal = (clone_flags & CSIGNAL), @@ -2424,6 +2435,9 @@ long do_fork(unsigned long clone_flags, .stack_size = stack_size, }; + if (!legacy_clone_args_valid(&args)) + return -EINVAL; + return _do_fork(&args); } #endif @@ -2505,8 +2519,7 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, .tls = tls, }; - /* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */ - if ((clone_flags & CLONE_PIDFD) && (clone_flags & CLONE_PARENT_SETTID)) + if (!legacy_clone_args_valid(&args)) return -EINVAL; return _do_fork(&args); -- cgit v1.2.3-55-g7522 From 330d48105245abfb8c9ca491dc53ea500657217a Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 13 Jun 2019 15:21:39 -0300 Subject: docs: admin-guide: add kdump documentation into it The Kdump documentation describes procedures with admins use in order to solve issues on their systems. Signed-off-by: Mauro Carvalho Chehab --- Documentation/admin-guide/bug-hunting.rst | 4 +- Documentation/admin-guide/index.rst | 1 + Documentation/admin-guide/kdump/gdbmacros.txt | 264 +++++++++++ Documentation/admin-guide/kdump/index.rst | 20 + Documentation/admin-guide/kdump/kdump.rst | 534 ++++++++++++++++++++++ Documentation/admin-guide/kdump/vmcoreinfo.rst | 488 ++++++++++++++++++++ Documentation/admin-guide/kernel-parameters.txt | 6 +- Documentation/kdump/gdbmacros.txt | 264 ----------- Documentation/kdump/index.rst | 21 - Documentation/kdump/kdump.rst | 534 ---------------------- Documentation/kdump/vmcoreinfo.rst | 488 -------------------- Documentation/powerpc/firmware-assisted-dump.txt | 2 +- Documentation/translations/zh_CN/oops-tracing.txt | 4 +- MAINTAINERS | 2 +- arch/arm/Kconfig | 2 +- arch/arm64/Kconfig | 2 +- arch/sh/Kconfig | 2 +- arch/x86/Kconfig | 4 +- 18 files changed, 1321 insertions(+), 1321 deletions(-) create mode 100644 Documentation/admin-guide/kdump/gdbmacros.txt create mode 100644 Documentation/admin-guide/kdump/index.rst create mode 100644 Documentation/admin-guide/kdump/kdump.rst create mode 100644 Documentation/admin-guide/kdump/vmcoreinfo.rst delete mode 100644 Documentation/kdump/gdbmacros.txt delete mode 100644 Documentation/kdump/index.rst delete mode 100644 Documentation/kdump/kdump.rst delete mode 100644 Documentation/kdump/vmcoreinfo.rst (limited to 'arch/x86') diff --git a/Documentation/admin-guide/bug-hunting.rst b/Documentation/admin-guide/bug-hunting.rst index b761aa2a51d2..44b8a4edd348 100644 --- a/Documentation/admin-guide/bug-hunting.rst +++ b/Documentation/admin-guide/bug-hunting.rst @@ -90,9 +90,9 @@ the disk is not available then you have three options: run a null modem to a second machine and capture the output there using your favourite communication program. Minicom works well. -(3) Use Kdump (see Documentation/kdump/kdump.rst), +(3) Use Kdump (see Documentation/admin-guide/kdump/kdump.rst), extract the kernel ring buffer from old memory with using dmesg - gdbmacro in Documentation/kdump/gdbmacros.txt. + gdbmacro in Documentation/admin-guide/kdump/gdbmacros.txt. Finding the bug's location -------------------------- diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index 6fcc83aaa9b6..5b63182ceb5f 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -39,6 +39,7 @@ problems and bugs in particular. ramoops dynamic-debug-howto init + kdump/index perf/index This is the beginning of a section with information of interest to diff --git a/Documentation/admin-guide/kdump/gdbmacros.txt b/Documentation/admin-guide/kdump/gdbmacros.txt new file mode 100644 index 000000000000..220d0a80ca2c --- /dev/null +++ b/Documentation/admin-guide/kdump/gdbmacros.txt @@ -0,0 +1,264 @@ +# +# This file contains a few gdb macros (user defined commands) to extract +# useful information from kernel crashdump (kdump) like stack traces of +# all the processes or a particular process and trapinfo. +# +# These macros can be used by copying this file in .gdbinit (put in home +# directory or current directory) or by invoking gdb command with +# --command= option +# +# Credits: +# Alexander Nyberg +# V Srivatsa +# Maneesh Soni +# + +define bttnobp + set $tasks_off=((size_t)&((struct task_struct *)0)->tasks) + set $pid_off=((size_t)&((struct task_struct *)0)->thread_group.next) + set $init_t=&init_task + set $next_t=(((char *)($init_t->tasks).next) - $tasks_off) + set var $stacksize = sizeof(union thread_union) + while ($next_t != $init_t) + set $next_t=(struct task_struct *)$next_t + printf "\npid %d; comm %s:\n", $next_t.pid, $next_t.comm + printf "===================\n" + set var $stackp = $next_t.thread.sp + set var $stack_top = ($stackp & ~($stacksize - 1)) + $stacksize + + while ($stackp < $stack_top) + if (*($stackp) > _stext && *($stackp) < _sinittext) + info symbol *($stackp) + end + set $stackp += 4 + end + set $next_th=(((char *)$next_t->thread_group.next) - $pid_off) + while ($next_th != $next_t) + set $next_th=(struct task_struct *)$next_th + printf "\npid %d; comm %s:\n", $next_t.pid, $next_t.comm + printf "===================\n" + set var $stackp = $next_t.thread.sp + set var $stack_top = ($stackp & ~($stacksize - 1)) + stacksize + + while ($stackp < $stack_top) + if (*($stackp) > _stext && *($stackp) < _sinittext) + info symbol *($stackp) + end + set $stackp += 4 + end + set $next_th=(((char *)$next_th->thread_group.next) - $pid_off) + end + set $next_t=(char *)($next_t->tasks.next) - $tasks_off + end +end +document bttnobp + dump all thread stack traces on a kernel compiled with !CONFIG_FRAME_POINTER +end + +define btthreadstack + set var $pid_task = $arg0 + + printf "\npid %d; comm %s:\n", $pid_task.pid, $pid_task.comm + printf "task struct: " + print $pid_task + printf "===================\n" + set var $stackp = $pid_task.thread.sp + set var $stacksize = sizeof(union thread_union) + set var $stack_top = ($stackp & ~($stacksize - 1)) + $stacksize + set var $stack_bot = ($stackp & ~($stacksize - 1)) + + set $stackp = *((unsigned long *) $stackp) + while (($stackp < $stack_top) && ($stackp > $stack_bot)) + set var $addr = *(((unsigned long *) $stackp) + 1) + info symbol $addr + set $stackp = *((unsigned long *) $stackp) + end +end +document btthreadstack + dump a thread stack using the given task structure pointer +end + + +define btt + set $tasks_off=((size_t)&((struct task_struct *)0)->tasks) + set $pid_off=((size_t)&((struct task_struct *)0)->thread_group.next) + set $init_t=&init_task + set $next_t=(((char *)($init_t->tasks).next) - $tasks_off) + while ($next_t != $init_t) + set $next_t=(struct task_struct *)$next_t + btthreadstack $next_t + + set $next_th=(((char *)$next_t->thread_group.next) - $pid_off) + while ($next_th != $next_t) + set $next_th=(struct task_struct *)$next_th + btthreadstack $next_th + set $next_th=(((char *)$next_th->thread_group.next) - $pid_off) + end + set $next_t=(char *)($next_t->tasks.next) - $tasks_off + end +end +document btt + dump all thread stack traces on a kernel compiled with CONFIG_FRAME_POINTER +end + +define btpid + set var $pid = $arg0 + set $tasks_off=((size_t)&((struct task_struct *)0)->tasks) + set $pid_off=((size_t)&((struct task_struct *)0)->thread_group.next) + set $init_t=&init_task + set $next_t=(((char *)($init_t->tasks).next) - $tasks_off) + set var $pid_task = 0 + + while ($next_t != $init_t) + set $next_t=(struct task_struct *)$next_t + + if ($next_t.pid == $pid) + set $pid_task = $next_t + end + + set $next_th=(((char *)$next_t->thread_group.next) - $pid_off) + while ($next_th != $next_t) + set $next_th=(struct task_struct *)$next_th + if ($next_th.pid == $pid) + set $pid_task = $next_th + end + set $next_th=(((char *)$next_th->thread_group.next) - $pid_off) + end + set $next_t=(char *)($next_t->tasks.next) - $tasks_off + end + + btthreadstack $pid_task +end +document btpid + backtrace of pid +end + + +define trapinfo + set var $pid = $arg0 + set $tasks_off=((size_t)&((struct task_struct *)0)->tasks) + set $pid_off=((size_t)&((struct task_struct *)0)->thread_group.next) + set $init_t=&init_task + set $next_t=(((char *)($init_t->tasks).next) - $tasks_off) + set var $pid_task = 0 + + while ($next_t != $init_t) + set $next_t=(struct task_struct *)$next_t + + if ($next_t.pid == $pid) + set $pid_task = $next_t + end + + set $next_th=(((char *)$next_t->thread_group.next) - $pid_off) + while ($next_th != $next_t) + set $next_th=(struct task_struct *)$next_th + if ($next_th.pid == $pid) + set $pid_task = $next_th + end + set $next_th=(((char *)$next_th->thread_group.next) - $pid_off) + end + set $next_t=(char *)($next_t->tasks.next) - $tasks_off + end + + printf "Trapno %ld, cr2 0x%lx, error_code %ld\n", $pid_task.thread.trap_no, \ + $pid_task.thread.cr2, $pid_task.thread.error_code + +end +document trapinfo + Run info threads and lookup pid of thread #1 + 'trapinfo ' will tell you by which trap & possibly + address the kernel panicked. +end + +define dump_log_idx + set $idx = $arg0 + if ($argc > 1) + set $prev_flags = $arg1 + else + set $prev_flags = 0 + end + set $msg = ((struct printk_log *) (log_buf + $idx)) + set $prefix = 1 + set $newline = 1 + set $log = log_buf + $idx + sizeof(*$msg) + + # prev & LOG_CONT && !(msg->flags & LOG_PREIX) + if (($prev_flags & 8) && !($msg->flags & 4)) + set $prefix = 0 + end + + # msg->flags & LOG_CONT + if ($msg->flags & 8) + # (prev & LOG_CONT && !(prev & LOG_NEWLINE)) + if (($prev_flags & 8) && !($prev_flags & 2)) + set $prefix = 0 + end + # (!(msg->flags & LOG_NEWLINE)) + if (!($msg->flags & 2)) + set $newline = 0 + end + end + + if ($prefix) + printf "[%5lu.%06lu] ", $msg->ts_nsec / 1000000000, $msg->ts_nsec % 1000000000 + end + if ($msg->text_len != 0) + eval "printf \"%%%d.%ds\", $log", $msg->text_len, $msg->text_len + end + if ($newline) + printf "\n" + end + if ($msg->dict_len > 0) + set $dict = $log + $msg->text_len + set $idx = 0 + set $line = 1 + while ($idx < $msg->dict_len) + if ($line) + printf " " + set $line = 0 + end + set $c = $dict[$idx] + if ($c == '\0') + printf "\n" + set $line = 1 + else + if ($c < ' ' || $c >= 127 || $c == '\\') + printf "\\x%02x", $c + else + printf "%c", $c + end + end + set $idx = $idx + 1 + end + printf "\n" + end +end +document dump_log_idx + Dump a single log given its index in the log buffer. The first + parameter is the index into log_buf, the second is optional and + specified the previous log buffer's flags, used for properly + formatting continued lines. +end + +define dmesg + set $i = log_first_idx + set $end_idx = log_first_idx + set $prev_flags = 0 + + while (1) + set $msg = ((struct printk_log *) (log_buf + $i)) + if ($msg->len == 0) + set $i = 0 + else + dump_log_idx $i $prev_flags + set $i = $i + $msg->len + set $prev_flags = $msg->flags + end + if ($i == $end_idx) + loop_break + end + end +end +document dmesg + print the kernel ring buffer +end diff --git a/Documentation/admin-guide/kdump/index.rst b/Documentation/admin-guide/kdump/index.rst new file mode 100644 index 000000000000..8e2ebd0383cd --- /dev/null +++ b/Documentation/admin-guide/kdump/index.rst @@ -0,0 +1,20 @@ + +================================================================ +Documentation for Kdump - The kexec-based Crash Dumping Solution +================================================================ + +This document includes overview, setup and installation, and analysis +information. + +.. toctree:: + :maxdepth: 1 + + kdump + vmcoreinfo + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst new file mode 100644 index 000000000000..ac7e131d2935 --- /dev/null +++ b/Documentation/admin-guide/kdump/kdump.rst @@ -0,0 +1,534 @@ +================================================================ +Documentation for Kdump - The kexec-based Crash Dumping Solution +================================================================ + +This document includes overview, setup and installation, and analysis +information. + +Overview +======== + +Kdump uses kexec to quickly boot to a dump-capture kernel whenever a +dump of the system kernel's memory needs to be taken (for example, when +the system panics). The system kernel's memory image is preserved across +the reboot and is accessible to the dump-capture kernel. + +You can use common commands, such as cp and scp, to copy the +memory image to a dump file on the local disk, or across the network to +a remote system. + +Kdump and kexec are currently supported on the x86, x86_64, ppc64, ia64, +s390x, arm and arm64 architectures. + +When the system kernel boots, it reserves a small section of memory for +the dump-capture kernel. This ensures that ongoing Direct Memory Access +(DMA) from the system kernel does not corrupt the dump-capture kernel. +The kexec -p command loads the dump-capture kernel into this reserved +memory. + +On x86 machines, the first 640 KB of physical memory is needed to boot, +regardless of where the kernel loads. Therefore, kexec backs up this +region just before rebooting into the dump-capture kernel. + +Similarly on PPC64 machines first 32KB of physical memory is needed for +booting regardless of where the kernel is loaded and to support 64K page +size kexec backs up the first 64KB memory. + +For s390x, when kdump is triggered, the crashkernel region is exchanged +with the region [0, crashkernel region size] and then the kdump kernel +runs in [0, crashkernel region size]. Therefore no relocatable kernel is +needed for s390x. + +All of the necessary information about the system kernel's core image is +encoded in the ELF format, and stored in a reserved area of memory +before a crash. The physical address of the start of the ELF header is +passed to the dump-capture kernel through the elfcorehdr= boot +parameter. Optionally the size of the ELF header can also be passed +when using the elfcorehdr=[size[KMG]@]offset[KMG] syntax. + + +With the dump-capture kernel, you can access the memory image through +/proc/vmcore. This exports the dump as an ELF-format file that you can +write out using file copy commands such as cp or scp. Further, you can +use analysis tools such as the GNU Debugger (GDB) and the Crash tool to +debug the dump file. This method ensures that the dump pages are correctly +ordered. + + +Setup and Installation +====================== + +Install kexec-tools +------------------- + +1) Login as the root user. + +2) Download the kexec-tools user-space package from the following URL: + +http://kernel.org/pub/linux/utils/kernel/kexec/kexec-tools.tar.gz + +This is a symlink to the latest version. + +The latest kexec-tools git tree is available at: + +- git://git.kernel.org/pub/scm/utils/kernel/kexec/kexec-tools.git +- http://www.kernel.org/pub/scm/utils/kernel/kexec/kexec-tools.git + +There is also a gitweb interface available at +http://www.kernel.org/git/?p=utils/kernel/kexec/kexec-tools.git + +More information about kexec-tools can be found at +http://horms.net/projects/kexec/ + +3) Unpack the tarball with the tar command, as follows:: + + tar xvpzf kexec-tools.tar.gz + +4) Change to the kexec-tools directory, as follows:: + + cd kexec-tools-VERSION + +5) Configure the package, as follows:: + + ./configure + +6) Compile the package, as follows:: + + make + +7) Install the package, as follows:: + + make install + + +Build the system and dump-capture kernels +----------------------------------------- +There are two possible methods of using Kdump. + +1) Build a separate custom dump-capture kernel for capturing the + kernel core dump. + +2) Or use the system kernel binary itself as dump-capture kernel and there is + no need to build a separate dump-capture kernel. This is possible + only with the architectures which support a relocatable kernel. As + of today, i386, x86_64, ppc64, ia64, arm and arm64 architectures support + relocatable kernel. + +Building a relocatable kernel is advantageous from the point of view that +one does not have to build a second kernel for capturing the dump. But +at the same time one might want to build a custom dump capture kernel +suitable to his needs. + +Following are the configuration setting required for system and +dump-capture kernels for enabling kdump support. + +System kernel config options +---------------------------- + +1) Enable "kexec system call" in "Processor type and features.":: + + CONFIG_KEXEC=y + +2) Enable "sysfs file system support" in "Filesystem" -> "Pseudo + filesystems." This is usually enabled by default:: + + CONFIG_SYSFS=y + + Note that "sysfs file system support" might not appear in the "Pseudo + filesystems" menu if "Configure standard kernel features (for small + systems)" is not enabled in "General Setup." In this case, check the + .config file itself to ensure that sysfs is turned on, as follows:: + + grep 'CONFIG_SYSFS' .config + +3) Enable "Compile the kernel with debug info" in "Kernel hacking.":: + + CONFIG_DEBUG_INFO=Y + + This causes the kernel to be built with debug symbols. The dump + analysis tools require a vmlinux with debug symbols in order to read + and analyze a dump file. + +Dump-capture kernel config options (Arch Independent) +----------------------------------------------------- + +1) Enable "kernel crash dumps" support under "Processor type and + features":: + + CONFIG_CRASH_DUMP=y + +2) Enable "/proc/vmcore support" under "Filesystems" -> "Pseudo filesystems":: + + CONFIG_PROC_VMCORE=y + + (CONFIG_PROC_VMCORE is set by default when CONFIG_CRASH_DUMP is selected.) + +Dump-capture kernel config options (Arch Dependent, i386 and x86_64) +-------------------------------------------------------------------- + +1) On i386, enable high memory support under "Processor type and + features":: + + CONFIG_HIGHMEM64G=y + + or:: + + CONFIG_HIGHMEM4G + +2) On i386 and x86_64, disable symmetric multi-processing support + under "Processor type and features":: + + CONFIG_SMP=n + + (If CONFIG_SMP=y, then specify maxcpus=1 on the kernel command line + when loading the dump-capture kernel, see section "Load the Dump-capture + Kernel".) + +3) If one wants to build and use a relocatable kernel, + Enable "Build a relocatable kernel" support under "Processor type and + features":: + + CONFIG_RELOCATABLE=y + +4) Use a suitable value for "Physical address where the kernel is + loaded" (under "Processor type and features"). This only appears when + "kernel crash dumps" is enabled. A suitable value depends upon + whether kernel is relocatable or not. + + If you are using a relocatable kernel use CONFIG_PHYSICAL_START=0x100000 + This will compile the kernel for physical address 1MB, but given the fact + kernel is relocatable, it can be run from any physical address hence + kexec boot loader will load it in memory region reserved for dump-capture + kernel. + + Otherwise it should be the start of memory region reserved for + second kernel using boot parameter "crashkernel=Y@X". Here X is + start of memory region reserved for dump-capture kernel. + Generally X is 16MB (0x1000000). So you can set + CONFIG_PHYSICAL_START=0x1000000 + +5) Make and install the kernel and its modules. DO NOT add this kernel + to the boot loader configuration files. + +Dump-capture kernel config options (Arch Dependent, ppc64) +---------------------------------------------------------- + +1) Enable "Build a kdump crash kernel" support under "Kernel" options:: + + CONFIG_CRASH_DUMP=y + +2) Enable "Build a relocatable kernel" support:: + + CONFIG_RELOCATABLE=y + + Make and install the kernel and its modules. + +Dump-capture kernel config options (Arch Dependent, ia64) +---------------------------------------------------------- + +- No specific options are required to create a dump-capture kernel + for ia64, other than those specified in the arch independent section + above. This means that it is possible to use the system kernel + as a dump-capture kernel if desired. + + The crashkernel region can be automatically placed by the system + kernel at run time. This is done by specifying the base address as 0, + or omitting it all together:: + + crashkernel=256M@0 + + or:: + + crashkernel=256M + + If the start address is specified, note that the start address of the + kernel will be aligned to 64Mb, so if the start address is not then + any space below the alignment point will be wasted. + +Dump-capture kernel config options (Arch Dependent, arm) +---------------------------------------------------------- + +- To use a relocatable kernel, + Enable "AUTO_ZRELADDR" support under "Boot" options:: + + AUTO_ZRELADDR=y + +Dump-capture kernel config options (Arch Dependent, arm64) +---------------------------------------------------------- + +- Please note that kvm of the dump-capture kernel will not be enabled + on non-VHE systems even if it is configured. This is because the CPU + will not be reset to EL2 on panic. + +Extended crashkernel syntax +=========================== + +While the "crashkernel=size[@offset]" syntax is sufficient for most +configurations, sometimes it's handy to have the reserved memory dependent +on the value of System RAM -- that's mostly for distributors that pre-setup +the kernel command line to avoid a unbootable system after some memory has +been removed from the machine. + +The syntax is:: + + crashkernel=:[,:,...][@offset] + range=start-[end] + +For example:: + + crashkernel=512M-2G:64M,2G-:128M + +This would mean: + + 1) if the RAM is smaller than 512M, then don't reserve anything + (this is the "rescue" case) + 2) if the RAM size is between 512M and 2G (exclusive), then reserve 64M + 3) if the RAM size is larger than 2G, then reserve 128M + + + +Boot into System Kernel +======================= + +1) Update the boot loader (such as grub, yaboot, or lilo) configuration + files as necessary. + +2) Boot the system kernel with the boot parameter "crashkernel=Y@X", + where Y specifies how much memory to reserve for the dump-capture kernel + and X specifies the beginning of this reserved memory. For example, + "crashkernel=64M@16M" tells the system kernel to reserve 64 MB of memory + starting at physical address 0x01000000 (16MB) for the dump-capture kernel. + + On x86 and x86_64, use "crashkernel=64M@16M". + + On ppc64, use "crashkernel=128M@32M". + + On ia64, 256M@256M is a generous value that typically works. + The region may be automatically placed on ia64, see the + dump-capture kernel config option notes above. + If use sparse memory, the size should be rounded to GRANULE boundaries. + + On s390x, typically use "crashkernel=xxM". The value of xx is dependent + on the memory consumption of the kdump system. In general this is not + dependent on the memory size of the production system. + + On arm, the use of "crashkernel=Y@X" is no longer necessary; the + kernel will automatically locate the crash kernel image within the + first 512MB of RAM if X is not given. + + On arm64, use "crashkernel=Y[@X]". Note that the start address of + the kernel, X if explicitly specified, must be aligned to 2MiB (0x200000). + +Load the Dump-capture Kernel +============================ + +After booting to the system kernel, dump-capture kernel needs to be +loaded. + +Based on the architecture and type of image (relocatable or not), one +can choose to load the uncompressed vmlinux or compressed bzImage/vmlinuz +of dump-capture kernel. Following is the summary. + +For i386 and x86_64: + + - Use vmlinux if kernel is not relocatable. + - Use bzImage/vmlinuz if kernel is relocatable. + +For ppc64: + + - Use vmlinux + +For ia64: + + - Use vmlinux or vmlinuz.gz + +For s390x: + + - Use image or bzImage + +For arm: + + - Use zImage + +For arm64: + + - Use vmlinux or Image + +If you are using an uncompressed vmlinux image then use following command +to load dump-capture kernel:: + + kexec -p \ + --initrd= --args-linux \ + --append="root= " + +If you are using a compressed bzImage/vmlinuz, then use following command +to load dump-capture kernel:: + + kexec -p \ + --initrd= \ + --append="root= " + +If you are using a compressed zImage, then use following command +to load dump-capture kernel:: + + kexec --type zImage -p \ + --initrd= \ + --dtb= \ + --append="root= " + +If you are using an uncompressed Image, then use following command +to load dump-capture kernel:: + + kexec -p \ + --initrd= \ + --append="root= " + +Please note, that --args-linux does not need to be specified for ia64. +It is planned to make this a no-op on that architecture, but for now +it should be omitted + +Following are the arch specific command line options to be used while +loading dump-capture kernel. + +For i386, x86_64 and ia64: + + "1 irqpoll maxcpus=1 reset_devices" + +For ppc64: + + "1 maxcpus=1 noirqdistrib reset_devices" + +For s390x: + + "1 maxcpus=1 cgroup_disable=memory" + +For arm: + + "1 maxcpus=1 reset_devices" + +For arm64: + + "1 maxcpus=1 reset_devices" + +Notes on loading the dump-capture kernel: + +* By default, the ELF headers are stored in ELF64 format to support + systems with more than 4GB memory. On i386, kexec automatically checks if + the physical RAM size exceeds the 4 GB limit and if not, uses ELF32. + So, on non-PAE systems, ELF32 is always used. + + The --elf32-core-headers option can be used to force the generation of ELF32 + headers. This is necessary because GDB currently cannot open vmcore files + with ELF64 headers on 32-bit systems. + +* The "irqpoll" boot parameter reduces driver initialization failures + due to shared interrupts in the dump-capture kernel. + +* You must specify in the format corresponding to the root + device name in the output of mount command. + +* Boot parameter "1" boots the dump-capture kernel into single-user + mode without networking. If you want networking, use "3". + +* We generally don't have to bring up a SMP kernel just to capture the + dump. Hence generally it is useful either to build a UP dump-capture + kernel or specify maxcpus=1 option while loading dump-capture kernel. + Note, though maxcpus always works, you had better replace it with + nr_cpus to save memory if supported by the current ARCH, such as x86. + +* You should enable multi-cpu support in dump-capture kernel if you intend + to use multi-thread programs with it, such as parallel dump feature of + makedumpfile. Otherwise, the multi-thread program may have a great + performance degradation. To enable multi-cpu support, you should bring up an + SMP dump-capture kernel and specify maxcpus/nr_cpus, disable_cpu_apicid=[X] + options while loading it. + +* For s390x there are two kdump modes: If a ELF header is specified with + the elfcorehdr= kernel parameter, it is used by the kdump kernel as it + is done on all other architectures. If no elfcorehdr= kernel parameter is + specified, the s390x kdump kernel dynamically creates the header. The + second mode has the advantage that for CPU and memory hotplug, kdump has + not to be reloaded with kexec_load(). + +* For s390x systems with many attached devices the "cio_ignore" kernel + parameter should be used for the kdump kernel in order to prevent allocation + of kernel memory for devices that are not relevant for kdump. The same + applies to systems that use SCSI/FCP devices. In that case the + "allow_lun_scan" zfcp module parameter should be set to zero before + setting FCP devices online. + +Kernel Panic +============ + +After successfully loading the dump-capture kernel as previously +described, the system will reboot into the dump-capture kernel if a +system crash is triggered. Trigger points are located in panic(), +die(), die_nmi() and in the sysrq handler (ALT-SysRq-c). + +The following conditions will execute a crash trigger point: + +If a hard lockup is detected and "NMI watchdog" is configured, the system +will boot into the dump-capture kernel ( die_nmi() ). + +If die() is called, and it happens to be a thread with pid 0 or 1, or die() +is called inside interrupt context or die() is called and panic_on_oops is set, +the system will boot into the dump-capture kernel. + +On powerpc systems when a soft-reset is generated, die() is called by all cpus +and the system will boot into the dump-capture kernel. + +For testing purposes, you can trigger a crash by using "ALT-SysRq-c", +"echo c > /proc/sysrq-trigger" or write a module to force the panic. + +Write Out the Dump File +======================= + +After the dump-capture kernel is booted, write out the dump file with +the following command:: + + cp /proc/vmcore + + +Analysis +======== + +Before analyzing the dump image, you should reboot into a stable kernel. + +You can do limited analysis using GDB on the dump file copied out of +/proc/vmcore. Use the debug vmlinux built with -g and run the following +command:: + + gdb vmlinux + +Stack trace for the task on processor 0, register display, and memory +display work fine. + +Note: GDB cannot analyze core files generated in ELF64 format for x86. +On systems with a maximum of 4GB of memory, you can generate +ELF32-format headers using the --elf32-core-headers kernel option on the +dump kernel. + +You can also use the Crash utility to analyze dump files in Kdump +format. Crash is available on Dave Anderson's site at the following URL: + + http://people.redhat.com/~anderson/ + +Trigger Kdump on WARN() +======================= + +The kernel parameter, panic_on_warn, calls panic() in all WARN() paths. This +will cause a kdump to occur at the panic() call. In cases where a user wants +to specify this during runtime, /proc/sys/kernel/panic_on_warn can be set to 1 +to achieve the same behaviour. + +Contact +======= + +- Vivek Goyal (vgoyal@redhat.com) +- Maneesh Soni (maneesh@in.ibm.com) + +GDB macros +========== + +.. include:: gdbmacros.txt + :literal: diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst new file mode 100644 index 000000000000..007a6b86e0ee --- /dev/null +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst @@ -0,0 +1,488 @@ +========== +VMCOREINFO +========== + +What is it? +=========== + +VMCOREINFO is a special ELF note section. It contains various +information from the kernel like structure size, page size, symbol +values, field offsets, etc. These data are packed into an ELF note +section and used by user-space tools like crash and makedumpfile to +analyze a kernel's memory layout. + +Common variables +================ + +init_uts_ns.name.release +------------------------ + +The version of the Linux kernel. Used to find the corresponding source +code from which the kernel has been built. For example, crash uses it to +find the corresponding vmlinux in order to process vmcore. + +PAGE_SIZE +--------- + +The size of a page. It is the smallest unit of data used by the memory +management facilities. It is usually 4096 bytes of size and a page is +aligned on 4096 bytes. Used for computing page addresses. + +init_uts_ns +----------- + +The UTS namespace which is used to isolate two specific elements of the +system that relate to the uname(2) system call. It is named after the +data structure used to store information returned by the uname(2) system +call. + +User-space tools can get the kernel name, host name, kernel release +number, kernel version, architecture name and OS type from it. + +node_online_map +--------------- + +An array node_states[N_ONLINE] which represents the set of online nodes +in a system, one bit position per node number. Used to keep track of +which nodes are in the system and online. + +swapper_pg_dir +-------------- + +The global page directory pointer of the kernel. Used to translate +virtual to physical addresses. + +_stext +------ + +Defines the beginning of the text section. In general, _stext indicates +the kernel start address. Used to convert a virtual address from the +direct kernel map to a physical address. + +vmap_area_list +-------------- + +Stores the virtual area list. makedumpfile gets the vmalloc start value +from this variable and its value is necessary for vmalloc translation. + +mem_map +------- + +Physical addresses are translated to struct pages by treating them as +an index into the mem_map array. Right-shifting a physical address +PAGE_SHIFT bits converts it into a page frame number which is an index +into that mem_map array. + +Used to map an address to the corresponding struct page. + +contig_page_data +---------------- + +Makedumpfile gets the pglist_data structure from this symbol, which is +used to describe the memory layout. + +User-space tools use this to exclude free pages when dumping memory. + +mem_section|(mem_section, NR_SECTION_ROOTS)|(mem_section, section_mem_map) +-------------------------------------------------------------------------- + +The address of the mem_section array, its length, structure size, and +the section_mem_map offset. + +It exists in the sparse memory mapping model, and it is also somewhat +similar to the mem_map variable, both of them are used to translate an +address. + +page +---- + +The size of a page structure. struct page is an important data structure +and it is widely used to compute contiguous memory. + +pglist_data +----------- + +The size of a pglist_data structure. This value is used to check if the +pglist_data structure is valid. It is also used for checking the memory +type. + +zone +---- + +The size of a zone structure. This value is used to check if the zone +structure has been found. It is also used for excluding free pages. + +free_area +--------- + +The size of a free_area structure. It indicates whether the free_area +structure is valid or not. Useful when excluding free pages. + +list_head +--------- + +The size of a list_head structure. Used when iterating lists in a +post-mortem analysis session. + +nodemask_t +---------- + +The size of a nodemask_t type. Used to compute the number of online +nodes. + +(page, flags|_refcount|mapping|lru|_mapcount|private|compound_dtor|compound_order|compound_head) +------------------------------------------------------------------------------------------------- + +User-space tools compute their values based on the offset of these +variables. The variables are used when excluding unnecessary pages. + +(pglist_data, node_zones|nr_zones|node_mem_map|node_start_pfn|node_spanned_pages|node_id) +----------------------------------------------------------------------------------------- + +On NUMA machines, each NUMA node has a pg_data_t to describe its memory +layout. On UMA machines there is a single pglist_data which describes the +whole memory. + +These values are used to check the memory type and to compute the +virtual address for memory map. + +(zone, free_area|vm_stat|spanned_pages) +--------------------------------------- + +Each node is divided into a number of blocks called zones which +represent ranges within memory. A zone is described by a structure zone. + +User-space tools compute required values based on the offset of these +variables. + +(free_area, free_list) +---------------------- + +Offset of the free_list's member. This value is used to compute the number +of free pages. + +Each zone has a free_area structure array called free_area[MAX_ORDER]. +The free_list represents a linked list of free page blocks. + +(list_head, next|prev) +---------------------- + +Offsets of the list_head's members. list_head is used to define a +circular linked list. User-space tools need these in order to traverse +lists. + +(vmap_area, va_start|list) +-------------------------- + +Offsets of the vmap_area's members. They carry vmalloc-specific +information. Makedumpfile gets the start address of the vmalloc region +from this. + +(zone.free_area, MAX_ORDER) +--------------------------- + +Free areas descriptor. User-space tools use this value to iterate the +free_area ranges. MAX_ORDER is used by the zone buddy allocator. + +log_first_idx +------------- + +Index of the first record stored in the buffer log_buf. Used by +user-space tools to read the strings in the log_buf. + +log_buf +------- + +Console output is written to the ring buffer log_buf at index +log_first_idx. Used to get the kernel log. + +log_buf_len +----------- + +log_buf's length. + +clear_idx +--------- + +The index that the next printk() record to read after the last clear +command. It indicates the first record after the last SYSLOG_ACTION +_CLEAR, like issued by 'dmesg -c'. Used by user-space tools to dump +the dmesg log. + +log_next_idx +------------ + +The index of the next record to store in the buffer log_buf. Used to +compute the index of the current buffer position. + +printk_log +---------- + +The size of a structure printk_log. Used to compute the size of +messages, and extract dmesg log. It encapsulates header information for +log_buf, such as timestamp, syslog level, etc. + +(printk_log, ts_nsec|len|text_len|dict_len) +------------------------------------------- + +It represents field offsets in struct printk_log. User space tools +parse it and check whether the values of printk_log's members have been +changed. + +(free_area.free_list, MIGRATE_TYPES) +------------------------------------ + +The number of migrate types for pages. The free_list is described by the +array. Used by tools to compute the number of free pages. + +NR_FREE_PAGES +------------- + +On linux-2.6.21 or later, the number of free pages is in +vm_stat[NR_FREE_PAGES]. Used to get the number of free pages. + +PG_lru|PG_private|PG_swapcache|PG_swapbacked|PG_slab|PG_hwpoision|PG_head_mask +------------------------------------------------------------------------------ + +Page attributes. These flags are used to filter various unnecessary for +dumping pages. + +PAGE_BUDDY_MAPCOUNT_VALUE(~PG_buddy)|PAGE_OFFLINE_MAPCOUNT_VALUE(~PG_offline) +----------------------------------------------------------------------------- + +More page attributes. These flags are used to filter various unnecessary for +dumping pages. + + +HUGETLB_PAGE_DTOR +----------------- + +The HUGETLB_PAGE_DTOR flag denotes hugetlbfs pages. Makedumpfile +excludes these pages. + +x86_64 +====== + +phys_base +--------- + +Used to convert the virtual address of an exported kernel symbol to its +corresponding physical address. + +init_top_pgt +------------ + +Used to walk through the whole page table and convert virtual addresses +to physical addresses. The init_top_pgt is somewhat similar to +swapper_pg_dir, but it is only used in x86_64. + +pgtable_l5_enabled +------------------ + +User-space tools need to know whether the crash kernel was in 5-level +paging mode. + +node_data +--------- + +This is a struct pglist_data array and stores all NUMA nodes +information. Makedumpfile gets the pglist_data structure from it. + +(node_data, MAX_NUMNODES) +------------------------- + +The maximum number of nodes in system. + +KERNELOFFSET +------------ + +The kernel randomization offset. Used to compute the page offset. If +KASLR is disabled, this value is zero. + +KERNEL_IMAGE_SIZE +----------------- + +Currently unused by Makedumpfile. Used to compute the module virtual +address by Crash. + +sme_mask +-------- + +AMD-specific with SME support: it indicates the secure memory encryption +mask. Makedumpfile tools need to know whether the crash kernel was +encrypted. If SME is enabled in the first kernel, the crash kernel's +page table entries (pgd/pud/pmd/pte) contain the memory encryption +mask. This is used to remove the SME mask and obtain the true physical +address. + +Currently, sme_mask stores the value of the C-bit position. If needed, +additional SME-relevant info can be placed in that variable. + +For example:: + + [ misc ][ enc bit ][ other misc SME info ] + 0000_0000_0000_0000_1000_0000_0000_0000_0000_0000_..._0000 + 63 59 55 51 47 43 39 35 31 27 ... 3 + +x86_32 +====== + +X86_PAE +------- + +Denotes whether physical address extensions are enabled. It has the cost +of a higher page table lookup overhead, and also consumes more page +table space per process. Used to check whether PAE was enabled in the +crash kernel when converting virtual addresses to physical addresses. + +ia64 +==== + +pgdat_list|(pgdat_list, MAX_NUMNODES) +------------------------------------- + +pg_data_t array storing all NUMA nodes information. MAX_NUMNODES +indicates the number of the nodes. + +node_memblk|(node_memblk, NR_NODE_MEMBLKS) +------------------------------------------ + +List of node memory chunks. Filled when parsing the SRAT table to obtain +information about memory nodes. NR_NODE_MEMBLKS indicates the number of +node memory chunks. + +These values are used to compute the number of nodes the crashed kernel used. + +node_memblk_s|(node_memblk_s, start_paddr)|(node_memblk_s, size) +---------------------------------------------------------------- + +The size of a struct node_memblk_s and the offsets of the +node_memblk_s's members. Used to compute the number of nodes. + +PGTABLE_3|PGTABLE_4 +------------------- + +User-space tools need to know whether the crash kernel was in 3-level or +4-level paging mode. Used to distinguish the page table. + +ARM64 +===== + +VA_BITS +------- + +The maximum number of bits for virtual addresses. Used to compute the +virtual memory ranges. + +kimage_voffset +-------------- + +The offset between the kernel virtual and physical mappings. Used to +translate virtual to physical addresses. + +PHYS_OFFSET +----------- + +Indicates the physical address of the start of memory. Similar to +kimage_voffset, which is used to translate virtual to physical +addresses. + +KERNELOFFSET +------------ + +The kernel randomization offset. Used to compute the page offset. If +KASLR is disabled, this value is zero. + +arm +=== + +ARM_LPAE +-------- + +It indicates whether the crash kernel supports large physical address +extensions. Used to translate virtual to physical addresses. + +s390 +==== + +lowcore_ptr +----------- + +An array with a pointer to the lowcore of every CPU. Used to print the +psw and all registers information. + +high_memory +----------- + +Used to get the vmalloc_start address from the high_memory symbol. + +(lowcore_ptr, NR_CPUS) +---------------------- + +The maximum number of CPUs. + +powerpc +======= + + +node_data|(node_data, MAX_NUMNODES) +----------------------------------- + +See above. + +contig_page_data +---------------- + +See above. + +vmemmap_list +------------ + +The vmemmap_list maintains the entire vmemmap physical mapping. Used +to get vmemmap list count and populated vmemmap regions info. If the +vmemmap address translation information is stored in the crash kernel, +it is used to translate vmemmap kernel virtual addresses. + +mmu_vmemmap_psize +----------------- + +The size of a page. Used to translate virtual to physical addresses. + +mmu_psize_defs +-------------- + +Page size definitions, i.e. 4k, 64k, or 16M. + +Used to make vtop translations. + +vmemmap_backing|(vmemmap_backing, list)|(vmemmap_backing, phys)|(vmemmap_backing, virt_addr) +-------------------------------------------------------------------------------------------- + +The vmemmap virtual address space management does not have a traditional +page table to track which virtual struct pages are backed by a physical +mapping. The virtual to physical mappings are tracked in a simple linked +list format. + +User-space tools need to know the offset of list, phys and virt_addr +when computing the count of vmemmap regions. + +mmu_psize_def|(mmu_psize_def, shift) +------------------------------------ + +The size of a struct mmu_psize_def and the offset of mmu_psize_def's +member. + +Used in vtop translations. + +sh +== + +node_data|(node_data, MAX_NUMNODES) +----------------------------------- + +See above. + +X2TLB +----- + +Indicates whether the crashed kernel enabled SH extended mode. diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 4821175a3769..e645b3ab4b6f 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -708,14 +708,14 @@ [KNL, x86_64] select a region under 4G first, and fall back to reserve region above 4G when '@offset' hasn't been specified. - See Documentation/kdump/kdump.rst for further details. + See Documentation/admin-guide/kdump/kdump.rst for further details. crashkernel=range1:size1[,range2:size2,...][@offset] [KNL] Same as above, but depends on the memory in the running system. The syntax of range is start-[end] where start and end are both a memory unit (amount[KMG]). See also - Documentation/kdump/kdump.rst for an example. + Documentation/admin-guide/kdump/kdump.rst for an example. crashkernel=size[KMG],high [KNL, x86_64] range could be above 4G. Allow kernel @@ -1207,7 +1207,7 @@ Specifies physical address of start of kernel core image elf header and optionally the size. Generally kexec loader will pass this option to capture kernel. - See Documentation/kdump/kdump.rst for details. + See Documentation/admin-guide/kdump/kdump.rst for details. enable_mtrr_cleanup [X86] The kernel tries to adjust MTRR layout from continuous diff --git a/Documentation/kdump/gdbmacros.txt b/Documentation/kdump/gdbmacros.txt deleted file mode 100644 index 220d0a80ca2c..000000000000 --- a/Documentation/kdump/gdbmacros.txt +++ /dev/null @@ -1,264 +0,0 @@ -# -# This file contains a few gdb macros (user defined commands) to extract -# useful information from kernel crashdump (kdump) like stack traces of -# all the processes or a particular process and trapinfo. -# -# These macros can be used by copying this file in .gdbinit (put in home -# directory or current directory) or by invoking gdb command with -# --command= option -# -# Credits: -# Alexander Nyberg -# V Srivatsa -# Maneesh Soni -# - -define bttnobp - set $tasks_off=((size_t)&((struct task_struct *)0)->tasks) - set $pid_off=((size_t)&((struct task_struct *)0)->thread_group.next) - set $init_t=&init_task - set $next_t=(((char *)($init_t->tasks).next) - $tasks_off) - set var $stacksize = sizeof(union thread_union) - while ($next_t != $init_t) - set $next_t=(struct task_struct *)$next_t - printf "\npid %d; comm %s:\n", $next_t.pid, $next_t.comm - printf "===================\n" - set var $stackp = $next_t.thread.sp - set var $stack_top = ($stackp & ~($stacksize - 1)) + $stacksize - - while ($stackp < $stack_top) - if (*($stackp) > _stext && *($stackp) < _sinittext) - info symbol *($stackp) - end - set $stackp += 4 - end - set $next_th=(((char *)$next_t->thread_group.next) - $pid_off) - while ($next_th != $next_t) - set $next_th=(struct task_struct *)$next_th - printf "\npid %d; comm %s:\n", $next_t.pid, $next_t.comm - printf "===================\n" - set var $stackp = $next_t.thread.sp - set var $stack_top = ($stackp & ~($stacksize - 1)) + stacksize - - while ($stackp < $stack_top) - if (*($stackp) > _stext && *($stackp) < _sinittext) - info symbol *($stackp) - end - set $stackp += 4 - end - set $next_th=(((char *)$next_th->thread_group.next) - $pid_off) - end - set $next_t=(char *)($next_t->tasks.next) - $tasks_off - end -end -document bttnobp - dump all thread stack traces on a kernel compiled with !CONFIG_FRAME_POINTER -end - -define btthreadstack - set var $pid_task = $arg0 - - printf "\npid %d; comm %s:\n", $pid_task.pid, $pid_task.comm - printf "task struct: " - print $pid_task - printf "===================\n" - set var $stackp = $pid_task.thread.sp - set var $stacksize = sizeof(union thread_union) - set var $stack_top = ($stackp & ~($stacksize - 1)) + $stacksize - set var $stack_bot = ($stackp & ~($stacksize - 1)) - - set $stackp = *((unsigned long *) $stackp) - while (($stackp < $stack_top) && ($stackp > $stack_bot)) - set var $addr = *(((unsigned long *) $stackp) + 1) - info symbol $addr - set $stackp = *((unsigned long *) $stackp) - end -end -document btthreadstack - dump a thread stack using the given task structure pointer -end - - -define btt - set $tasks_off=((size_t)&((struct task_struct *)0)->tasks) - set $pid_off=((size_t)&((struct task_struct *)0)->thread_group.next) - set $init_t=&init_task - set $next_t=(((char *)($init_t->tasks).next) - $tasks_off) - while ($next_t != $init_t) - set $next_t=(struct task_struct *)$next_t - btthreadstack $next_t - - set $next_th=(((char *)$next_t->thread_group.next) - $pid_off) - while ($next_th != $next_t) - set $next_th=(struct task_struct *)$next_th - btthreadstack $next_th - set $next_th=(((char *)$next_th->thread_group.next) - $pid_off) - end - set $next_t=(char *)($next_t->tasks.next) - $tasks_off - end -end -document btt - dump all thread stack traces on a kernel compiled with CONFIG_FRAME_POINTER -end - -define btpid - set var $pid = $arg0 - set $tasks_off=((size_t)&((struct task_struct *)0)->tasks) - set $pid_off=((size_t)&((struct task_struct *)0)->thread_group.next) - set $init_t=&init_task - set $next_t=(((char *)($init_t->tasks).next) - $tasks_off) - set var $pid_task = 0 - - while ($next_t != $init_t) - set $next_t=(struct task_struct *)$next_t - - if ($next_t.pid == $pid) - set $pid_task = $next_t - end - - set $next_th=(((char *)$next_t->thread_group.next) - $pid_off) - while ($next_th != $next_t) - set $next_th=(struct task_struct *)$next_th - if ($next_th.pid == $pid) - set $pid_task = $next_th - end - set $next_th=(((char *)$next_th->thread_group.next) - $pid_off) - end - set $next_t=(char *)($next_t->tasks.next) - $tasks_off - end - - btthreadstack $pid_task -end -document btpid - backtrace of pid -end - - -define trapinfo - set var $pid = $arg0 - set $tasks_off=((size_t)&((struct task_struct *)0)->tasks) - set $pid_off=((size_t)&((struct task_struct *)0)->thread_group.next) - set $init_t=&init_task - set $next_t=(((char *)($init_t->tasks).next) - $tasks_off) - set var $pid_task = 0 - - while ($next_t != $init_t) - set $next_t=(struct task_struct *)$next_t - - if ($next_t.pid == $pid) - set $pid_task = $next_t - end - - set $next_th=(((char *)$next_t->thread_group.next) - $pid_off) - while ($next_th != $next_t) - set $next_th=(struct task_struct *)$next_th - if ($next_th.pid == $pid) - set $pid_task = $next_th - end - set $next_th=(((char *)$next_th->thread_group.next) - $pid_off) - end - set $next_t=(char *)($next_t->tasks.next) - $tasks_off - end - - printf "Trapno %ld, cr2 0x%lx, error_code %ld\n", $pid_task.thread.trap_no, \ - $pid_task.thread.cr2, $pid_task.thread.error_code - -end -document trapinfo - Run info threads and lookup pid of thread #1 - 'trapinfo ' will tell you by which trap & possibly - address the kernel panicked. -end - -define dump_log_idx - set $idx = $arg0 - if ($argc > 1) - set $prev_flags = $arg1 - else - set $prev_flags = 0 - end - set $msg = ((struct printk_log *) (log_buf + $idx)) - set $prefix = 1 - set $newline = 1 - set $log = log_buf + $idx + sizeof(*$msg) - - # prev & LOG_CONT && !(msg->flags & LOG_PREIX) - if (($prev_flags & 8) && !($msg->flags & 4)) - set $prefix = 0 - end - - # msg->flags & LOG_CONT - if ($msg->flags & 8) - # (prev & LOG_CONT && !(prev & LOG_NEWLINE)) - if (($prev_flags & 8) && !($prev_flags & 2)) - set $prefix = 0 - end - # (!(msg->flags & LOG_NEWLINE)) - if (!($msg->flags & 2)) - set $newline = 0 - end - end - - if ($prefix) - printf "[%5lu.%06lu] ", $msg->ts_nsec / 1000000000, $msg->ts_nsec % 1000000000 - end - if ($msg->text_len != 0) - eval "printf \"%%%d.%ds\", $log", $msg->text_len, $msg->text_len - end - if ($newline) - printf "\n" - end - if ($msg->dict_len > 0) - set $dict = $log + $msg->text_len - set $idx = 0 - set $line = 1 - while ($idx < $msg->dict_len) - if ($line) - printf " " - set $line = 0 - end - set $c = $dict[$idx] - if ($c == '\0') - printf "\n" - set $line = 1 - else - if ($c < ' ' || $c >= 127 || $c == '\\') - printf "\\x%02x", $c - else - printf "%c", $c - end - end - set $idx = $idx + 1 - end - printf "\n" - end -end -document dump_log_idx - Dump a single log given its index in the log buffer. The first - parameter is the index into log_buf, the second is optional and - specified the previous log buffer's flags, used for properly - formatting continued lines. -end - -define dmesg - set $i = log_first_idx - set $end_idx = log_first_idx - set $prev_flags = 0 - - while (1) - set $msg = ((struct printk_log *) (log_buf + $i)) - if ($msg->len == 0) - set $i = 0 - else - dump_log_idx $i $prev_flags - set $i = $i + $msg->len - set $prev_flags = $msg->flags - end - if ($i == $end_idx) - loop_break - end - end -end -document dmesg - print the kernel ring buffer -end diff --git a/Documentation/kdump/index.rst b/Documentation/kdump/index.rst deleted file mode 100644 index 2b17fcf6867a..000000000000 --- a/Documentation/kdump/index.rst +++ /dev/null @@ -1,21 +0,0 @@ -:orphan: - -================================================================ -Documentation for Kdump - The kexec-based Crash Dumping Solution -================================================================ - -This document includes overview, setup and installation, and analysis -information. - -.. toctree:: - :maxdepth: 1 - - kdump - vmcoreinfo - -.. only:: subproject and html - - Indices - ======= - - * :ref:`genindex` diff --git a/Documentation/kdump/kdump.rst b/Documentation/kdump/kdump.rst deleted file mode 100644 index ac7e131d2935..000000000000 --- a/Documentation/kdump/kdump.rst +++ /dev/null @@ -1,534 +0,0 @@ -================================================================ -Documentation for Kdump - The kexec-based Crash Dumping Solution -================================================================ - -This document includes overview, setup and installation, and analysis -information. - -Overview -======== - -Kdump uses kexec to quickly boot to a dump-capture kernel whenever a -dump of the system kernel's memory needs to be taken (for example, when -the system panics). The system kernel's memory image is preserved across -the reboot and is accessible to the dump-capture kernel. - -You can use common commands, such as cp and scp, to copy the -memory image to a dump file on the local disk, or across the network to -a remote system. - -Kdump and kexec are currently supported on the x86, x86_64, ppc64, ia64, -s390x, arm and arm64 architectures. - -When the system kernel boots, it reserves a small section of memory for -the dump-capture kernel. This ensures that ongoing Direct Memory Access -(DMA) from the system kernel does not corrupt the dump-capture kernel. -The kexec -p command loads the dump-capture kernel into this reserved -memory. - -On x86 machines, the first 640 KB of physical memory is needed to boot, -regardless of where the kernel loads. Therefore, kexec backs up this -region just before rebooting into the dump-capture kernel. - -Similarly on PPC64 machines first 32KB of physical memory is needed for -booting regardless of where the kernel is loaded and to support 64K page -size kexec backs up the first 64KB memory. - -For s390x, when kdump is triggered, the crashkernel region is exchanged -with the region [0, crashkernel region size] and then the kdump kernel -runs in [0, crashkernel region size]. Therefore no relocatable kernel is -needed for s390x. - -All of the necessary information about the system kernel's core image is -encoded in the ELF format, and stored in a reserved area of memory -before a crash. The physical address of the start of the ELF header is -passed to the dump-capture kernel through the elfcorehdr= boot -parameter. Optionally the size of the ELF header can also be passed -when using the elfcorehdr=[size[KMG]@]offset[KMG] syntax. - - -With the dump-capture kernel, you can access the memory image through -/proc/vmcore. This exports the dump as an ELF-format file that you can -write out using file copy commands such as cp or scp. Further, you can -use analysis tools such as the GNU Debugger (GDB) and the Crash tool to -debug the dump file. This method ensures that the dump pages are correctly -ordered. - - -Setup and Installation -====================== - -Install kexec-tools -------------------- - -1) Login as the root user. - -2) Download the kexec-tools user-space package from the following URL: - -http://kernel.org/pub/linux/utils/kernel/kexec/kexec-tools.tar.gz - -This is a symlink to the latest version. - -The latest kexec-tools git tree is available at: - -- git://git.kernel.org/pub/scm/utils/kernel/kexec/kexec-tools.git -- http://www.kernel.org/pub/scm/utils/kernel/kexec/kexec-tools.git - -There is also a gitweb interface available at -http://www.kernel.org/git/?p=utils/kernel/kexec/kexec-tools.git - -More information about kexec-tools can be found at -http://horms.net/projects/kexec/ - -3) Unpack the tarball with the tar command, as follows:: - - tar xvpzf kexec-tools.tar.gz - -4) Change to the kexec-tools directory, as follows:: - - cd kexec-tools-VERSION - -5) Configure the package, as follows:: - - ./configure - -6) Compile the package, as follows:: - - make - -7) Install the package, as follows:: - - make install - - -Build the system and dump-capture kernels ------------------------------------------ -There are two possible methods of using Kdump. - -1) Build a separate custom dump-capture kernel for capturing the - kernel core dump. - -2) Or use the system kernel binary itself as dump-capture kernel and there is - no need to build a separate dump-capture kernel. This is possible - only with the architectures which support a relocatable kernel. As - of today, i386, x86_64, ppc64, ia64, arm and arm64 architectures support - relocatable kernel. - -Building a relocatable kernel is advantageous from the point of view that -one does not have to build a second kernel for capturing the dump. But -at the same time one might want to build a custom dump capture kernel -suitable to his needs. - -Following are the configuration setting required for system and -dump-capture kernels for enabling kdump support. - -System kernel config options ----------------------------- - -1) Enable "kexec system call" in "Processor type and features.":: - - CONFIG_KEXEC=y - -2) Enable "sysfs file system support" in "Filesystem" -> "Pseudo - filesystems." This is usually enabled by default:: - - CONFIG_SYSFS=y - - Note that "sysfs file system support" might not appear in the "Pseudo - filesystems" menu if "Configure standard kernel features (for small - systems)" is not enabled in "General Setup." In this case, check the - .config file itself to ensure that sysfs is turned on, as follows:: - - grep 'CONFIG_SYSFS' .config - -3) Enable "Compile the kernel with debug info" in "Kernel hacking.":: - - CONFIG_DEBUG_INFO=Y - - This causes the kernel to be built with debug symbols. The dump - analysis tools require a vmlinux with debug symbols in order to read - and analyze a dump file. - -Dump-capture kernel config options (Arch Independent) ------------------------------------------------------ - -1) Enable "kernel crash dumps" support under "Processor type and - features":: - - CONFIG_CRASH_DUMP=y - -2) Enable "/proc/vmcore support" under "Filesystems" -> "Pseudo filesystems":: - - CONFIG_PROC_VMCORE=y - - (CONFIG_PROC_VMCORE is set by default when CONFIG_CRASH_DUMP is selected.) - -Dump-capture kernel config options (Arch Dependent, i386 and x86_64) --------------------------------------------------------------------- - -1) On i386, enable high memory support under "Processor type and - features":: - - CONFIG_HIGHMEM64G=y - - or:: - - CONFIG_HIGHMEM4G - -2) On i386 and x86_64, disable symmetric multi-processing support - under "Processor type and features":: - - CONFIG_SMP=n - - (If CONFIG_SMP=y, then specify maxcpus=1 on the kernel command line - when loading the dump-capture kernel, see section "Load the Dump-capture - Kernel".) - -3) If one wants to build and use a relocatable kernel, - Enable "Build a relocatable kernel" support under "Processor type and - features":: - - CONFIG_RELOCATABLE=y - -4) Use a suitable value for "Physical address where the kernel is - loaded" (under "Processor type and features"). This only appears when - "kernel crash dumps" is enabled. A suitable value depends upon - whether kernel is relocatable or not. - - If you are using a relocatable kernel use CONFIG_PHYSICAL_START=0x100000 - This will compile the kernel for physical address 1MB, but given the fact - kernel is relocatable, it can be run from any physical address hence - kexec boot loader will load it in memory region reserved for dump-capture - kernel. - - Otherwise it should be the start of memory region reserved for - second kernel using boot parameter "crashkernel=Y@X". Here X is - start of memory region reserved for dump-capture kernel. - Generally X is 16MB (0x1000000). So you can set - CONFIG_PHYSICAL_START=0x1000000 - -5) Make and install the kernel and its modules. DO NOT add this kernel - to the boot loader configuration files. - -Dump-capture kernel config options (Arch Dependent, ppc64) ----------------------------------------------------------- - -1) Enable "Build a kdump crash kernel" support under "Kernel" options:: - - CONFIG_CRASH_DUMP=y - -2) Enable "Build a relocatable kernel" support:: - - CONFIG_RELOCATABLE=y - - Make and install the kernel and its modules. - -Dump-capture kernel config options (Arch Dependent, ia64) ----------------------------------------------------------- - -- No specific options are required to create a dump-capture kernel - for ia64, other than those specified in the arch independent section - above. This means that it is possible to use the system kernel - as a dump-capture kernel if desired. - - The crashkernel region can be automatically placed by the system - kernel at run time. This is done by specifying the base address as 0, - or omitting it all together:: - - crashkernel=256M@0 - - or:: - - crashkernel=256M - - If the start address is specified, note that the start address of the - kernel will be aligned to 64Mb, so if the start address is not then - any space below the alignment point will be wasted. - -Dump-capture kernel config options (Arch Dependent, arm) ----------------------------------------------------------- - -- To use a relocatable kernel, - Enable "AUTO_ZRELADDR" support under "Boot" options:: - - AUTO_ZRELADDR=y - -Dump-capture kernel config options (Arch Dependent, arm64) ----------------------------------------------------------- - -- Please note that kvm of the dump-capture kernel will not be enabled - on non-VHE systems even if it is configured. This is because the CPU - will not be reset to EL2 on panic. - -Extended crashkernel syntax -=========================== - -While the "crashkernel=size[@offset]" syntax is sufficient for most -configurations, sometimes it's handy to have the reserved memory dependent -on the value of System RAM -- that's mostly for distributors that pre-setup -the kernel command line to avoid a unbootable system after some memory has -been removed from the machine. - -The syntax is:: - - crashkernel=:[,:,...][@offset] - range=start-[end] - -For example:: - - crashkernel=512M-2G:64M,2G-:128M - -This would mean: - - 1) if the RAM is smaller than 512M, then don't reserve anything - (this is the "rescue" case) - 2) if the RAM size is between 512M and 2G (exclusive), then reserve 64M - 3) if the RAM size is larger than 2G, then reserve 128M - - - -Boot into System Kernel -======================= - -1) Update the boot loader (such as grub, yaboot, or lilo) configuration - files as necessary. - -2) Boot the system kernel with the boot parameter "crashkernel=Y@X", - where Y specifies how much memory to reserve for the dump-capture kernel - and X specifies the beginning of this reserved memory. For example, - "crashkernel=64M@16M" tells the system kernel to reserve 64 MB of memory - starting at physical address 0x01000000 (16MB) for the dump-capture kernel. - - On x86 and x86_64, use "crashkernel=64M@16M". - - On ppc64, use "crashkernel=128M@32M". - - On ia64, 256M@256M is a generous value that typically works. - The region may be automatically placed on ia64, see the - dump-capture kernel config option notes above. - If use sparse memory, the size should be rounded to GRANULE boundaries. - - On s390x, typically use "crashkernel=xxM". The value of xx is dependent - on the memory consumption of the kdump system. In general this is not - dependent on the memory size of the production system. - - On arm, the use of "crashkernel=Y@X" is no longer necessary; the - kernel will automatically locate the crash kernel image within the - first 512MB of RAM if X is not given. - - On arm64, use "crashkernel=Y[@X]". Note that the start address of - the kernel, X if explicitly specified, must be aligned to 2MiB (0x200000). - -Load the Dump-capture Kernel -============================ - -After booting to the system kernel, dump-capture kernel needs to be -loaded. - -Based on the architecture and type of image (relocatable or not), one -can choose to load the uncompressed vmlinux or compressed bzImage/vmlinuz -of dump-capture kernel. Following is the summary. - -For i386 and x86_64: - - - Use vmlinux if kernel is not relocatable. - - Use bzImage/vmlinuz if kernel is relocatable. - -For ppc64: - - - Use vmlinux - -For ia64: - - - Use vmlinux or vmlinuz.gz - -For s390x: - - - Use image or bzImage - -For arm: - - - Use zImage - -For arm64: - - - Use vmlinux or Image - -If you are using an uncompressed vmlinux image then use following command -to load dump-capture kernel:: - - kexec -p \ - --initrd= --args-linux \ - --append="root= " - -If you are using a compressed bzImage/vmlinuz, then use following command -to load dump-capture kernel:: - - kexec -p \ - --initrd= \ - --append="root= " - -If you are using a compressed zImage, then use following command -to load dump-capture kernel:: - - kexec --type zImage -p \ - --initrd= \ - --dtb= \ - --append="root= " - -If you are using an uncompressed Image, then use following command -to load dump-capture kernel:: - - kexec -p \ - --initrd= \ - --append="root= " - -Please note, that --args-linux does not need to be specified for ia64. -It is planned to make this a no-op on that architecture, but for now -it should be omitted - -Following are the arch specific command line options to be used while -loading dump-capture kernel. - -For i386, x86_64 and ia64: - - "1 irqpoll maxcpus=1 reset_devices" - -For ppc64: - - "1 maxcpus=1 noirqdistrib reset_devices" - -For s390x: - - "1 maxcpus=1 cgroup_disable=memory" - -For arm: - - "1 maxcpus=1 reset_devices" - -For arm64: - - "1 maxcpus=1 reset_devices" - -Notes on loading the dump-capture kernel: - -* By default, the ELF headers are stored in ELF64 format to support - systems with more than 4GB memory. On i386, kexec automatically checks if - the physical RAM size exceeds the 4 GB limit and if not, uses ELF32. - So, on non-PAE systems, ELF32 is always used. - - The --elf32-core-headers option can be used to force the generation of ELF32 - headers. This is necessary because GDB currently cannot open vmcore files - with ELF64 headers on 32-bit systems. - -* The "irqpoll" boot parameter reduces driver initialization failures - due to shared interrupts in the dump-capture kernel. - -* You must specify in the format corresponding to the root - device name in the output of mount command. - -* Boot parameter "1" boots the dump-capture kernel into single-user - mode without networking. If you want networking, use "3". - -* We generally don't have to bring up a SMP kernel just to capture the - dump. Hence generally it is useful either to build a UP dump-capture - kernel or specify maxcpus=1 option while loading dump-capture kernel. - Note, though maxcpus always works, you had better replace it with - nr_cpus to save memory if supported by the current ARCH, such as x86. - -* You should enable multi-cpu support in dump-capture kernel if you intend - to use multi-thread programs with it, such as parallel dump feature of - makedumpfile. Otherwise, the multi-thread program may have a great - performance degradation. To enable multi-cpu support, you should bring up an - SMP dump-capture kernel and specify maxcpus/nr_cpus, disable_cpu_apicid=[X] - options while loading it. - -* For s390x there are two kdump modes: If a ELF header is specified with - the elfcorehdr= kernel parameter, it is used by the kdump kernel as it - is done on all other architectures. If no elfcorehdr= kernel parameter is - specified, the s390x kdump kernel dynamically creates the header. The - second mode has the advantage that for CPU and memory hotplug, kdump has - not to be reloaded with kexec_load(). - -* For s390x systems with many attached devices the "cio_ignore" kernel - parameter should be used for the kdump kernel in order to prevent allocation - of kernel memory for devices that are not relevant for kdump. The same - applies to systems that use SCSI/FCP devices. In that case the - "allow_lun_scan" zfcp module parameter should be set to zero before - setting FCP devices online. - -Kernel Panic -============ - -After successfully loading the dump-capture kernel as previously -described, the system will reboot into the dump-capture kernel if a -system crash is triggered. Trigger points are located in panic(), -die(), die_nmi() and in the sysrq handler (ALT-SysRq-c). - -The following conditions will execute a crash trigger point: - -If a hard lockup is detected and "NMI watchdog" is configured, the system -will boot into the dump-capture kernel ( die_nmi() ). - -If die() is called, and it happens to be a thread with pid 0 or 1, or die() -is called inside interrupt context or die() is called and panic_on_oops is set, -the system will boot into the dump-capture kernel. - -On powerpc systems when a soft-reset is generated, die() is called by all cpus -and the system will boot into the dump-capture kernel. - -For testing purposes, you can trigger a crash by using "ALT-SysRq-c", -"echo c > /proc/sysrq-trigger" or write a module to force the panic. - -Write Out the Dump File -======================= - -After the dump-capture kernel is booted, write out the dump file with -the following command:: - - cp /proc/vmcore - - -Analysis -======== - -Before analyzing the dump image, you should reboot into a stable kernel. - -You can do limited analysis using GDB on the dump file copied out of -/proc/vmcore. Use the debug vmlinux built with -g and run the following -command:: - - gdb vmlinux - -Stack trace for the task on processor 0, register display, and memory -display work fine. - -Note: GDB cannot analyze core files generated in ELF64 format for x86. -On systems with a maximum of 4GB of memory, you can generate -ELF32-format headers using the --elf32-core-headers kernel option on the -dump kernel. - -You can also use the Crash utility to analyze dump files in Kdump -format. Crash is available on Dave Anderson's site at the following URL: - - http://people.redhat.com/~anderson/ - -Trigger Kdump on WARN() -======================= - -The kernel parameter, panic_on_warn, calls panic() in all WARN() paths. This -will cause a kdump to occur at the panic() call. In cases where a user wants -to specify this during runtime, /proc/sys/kernel/panic_on_warn can be set to 1 -to achieve the same behaviour. - -Contact -======= - -- Vivek Goyal (vgoyal@redhat.com) -- Maneesh Soni (maneesh@in.ibm.com) - -GDB macros -========== - -.. include:: gdbmacros.txt - :literal: diff --git a/Documentation/kdump/vmcoreinfo.rst b/Documentation/kdump/vmcoreinfo.rst deleted file mode 100644 index 007a6b86e0ee..000000000000 --- a/Documentation/kdump/vmcoreinfo.rst +++ /dev/null @@ -1,488 +0,0 @@ -========== -VMCOREINFO -========== - -What is it? -=========== - -VMCOREINFO is a special ELF note section. It contains various -information from the kernel like structure size, page size, symbol -values, field offsets, etc. These data are packed into an ELF note -section and used by user-space tools like crash and makedumpfile to -analyze a kernel's memory layout. - -Common variables -================ - -init_uts_ns.name.release ------------------------- - -The version of the Linux kernel. Used to find the corresponding source -code from which the kernel has been built. For example, crash uses it to -find the corresponding vmlinux in order to process vmcore. - -PAGE_SIZE ---------- - -The size of a page. It is the smallest unit of data used by the memory -management facilities. It is usually 4096 bytes of size and a page is -aligned on 4096 bytes. Used for computing page addresses. - -init_uts_ns ------------ - -The UTS namespace which is used to isolate two specific elements of the -system that relate to the uname(2) system call. It is named after the -data structure used to store information returned by the uname(2) system -call. - -User-space tools can get the kernel name, host name, kernel release -number, kernel version, architecture name and OS type from it. - -node_online_map ---------------- - -An array node_states[N_ONLINE] which represents the set of online nodes -in a system, one bit position per node number. Used to keep track of -which nodes are in the system and online. - -swapper_pg_dir --------------- - -The global page directory pointer of the kernel. Used to translate -virtual to physical addresses. - -_stext ------- - -Defines the beginning of the text section. In general, _stext indicates -the kernel start address. Used to convert a virtual address from the -direct kernel map to a physical address. - -vmap_area_list --------------- - -Stores the virtual area list. makedumpfile gets the vmalloc start value -from this variable and its value is necessary for vmalloc translation. - -mem_map -------- - -Physical addresses are translated to struct pages by treating them as -an index into the mem_map array. Right-shifting a physical address -PAGE_SHIFT bits converts it into a page frame number which is an index -into that mem_map array. - -Used to map an address to the corresponding struct page. - -contig_page_data ----------------- - -Makedumpfile gets the pglist_data structure from this symbol, which is -used to describe the memory layout. - -User-space tools use this to exclude free pages when dumping memory. - -mem_section|(mem_section, NR_SECTION_ROOTS)|(mem_section, section_mem_map) --------------------------------------------------------------------------- - -The address of the mem_section array, its length, structure size, and -the section_mem_map offset. - -It exists in the sparse memory mapping model, and it is also somewhat -similar to the mem_map variable, both of them are used to translate an -address. - -page ----- - -The size of a page structure. struct page is an important data structure -and it is widely used to compute contiguous memory. - -pglist_data ------------ - -The size of a pglist_data structure. This value is used to check if the -pglist_data structure is valid. It is also used for checking the memory -type. - -zone ----- - -The size of a zone structure. This value is used to check if the zone -structure has been found. It is also used for excluding free pages. - -free_area ---------- - -The size of a free_area structure. It indicates whether the free_area -structure is valid or not. Useful when excluding free pages. - -list_head ---------- - -The size of a list_head structure. Used when iterating lists in a -post-mortem analysis session. - -nodemask_t ----------- - -The size of a nodemask_t type. Used to compute the number of online -nodes. - -(page, flags|_refcount|mapping|lru|_mapcount|private|compound_dtor|compound_order|compound_head) -------------------------------------------------------------------------------------------------- - -User-space tools compute their values based on the offset of these -variables. The variables are used when excluding unnecessary pages. - -(pglist_data, node_zones|nr_zones|node_mem_map|node_start_pfn|node_spanned_pages|node_id) ------------------------------------------------------------------------------------------ - -On NUMA machines, each NUMA node has a pg_data_t to describe its memory -layout. On UMA machines there is a single pglist_data which describes the -whole memory. - -These values are used to check the memory type and to compute the -virtual address for memory map. - -(zone, free_area|vm_stat|spanned_pages) ---------------------------------------- - -Each node is divided into a number of blocks called zones which -represent ranges within memory. A zone is described by a structure zone. - -User-space tools compute required values based on the offset of these -variables. - -(free_area, free_list) ----------------------- - -Offset of the free_list's member. This value is used to compute the number -of free pages. - -Each zone has a free_area structure array called free_area[MAX_ORDER]. -The free_list represents a linked list of free page blocks. - -(list_head, next|prev) ----------------------- - -Offsets of the list_head's members. list_head is used to define a -circular linked list. User-space tools need these in order to traverse -lists. - -(vmap_area, va_start|list) --------------------------- - -Offsets of the vmap_area's members. They carry vmalloc-specific -information. Makedumpfile gets the start address of the vmalloc region -from this. - -(zone.free_area, MAX_ORDER) ---------------------------- - -Free areas descriptor. User-space tools use this value to iterate the -free_area ranges. MAX_ORDER is used by the zone buddy allocator. - -log_first_idx -------------- - -Index of the first record stored in the buffer log_buf. Used by -user-space tools to read the strings in the log_buf. - -log_buf -------- - -Console output is written to the ring buffer log_buf at index -log_first_idx. Used to get the kernel log. - -log_buf_len ------------ - -log_buf's length. - -clear_idx ---------- - -The index that the next printk() record to read after the last clear -command. It indicates the first record after the last SYSLOG_ACTION -_CLEAR, like issued by 'dmesg -c'. Used by user-space tools to dump -the dmesg log. - -log_next_idx ------------- - -The index of the next record to store in the buffer log_buf. Used to -compute the index of the current buffer position. - -printk_log ----------- - -The size of a structure printk_log. Used to compute the size of -messages, and extract dmesg log. It encapsulates header information for -log_buf, such as timestamp, syslog level, etc. - -(printk_log, ts_nsec|len|text_len|dict_len) -------------------------------------------- - -It represents field offsets in struct printk_log. User space tools -parse it and check whether the values of printk_log's members have been -changed. - -(free_area.free_list, MIGRATE_TYPES) ------------------------------------- - -The number of migrate types for pages. The free_list is described by the -array. Used by tools to compute the number of free pages. - -NR_FREE_PAGES -------------- - -On linux-2.6.21 or later, the number of free pages is in -vm_stat[NR_FREE_PAGES]. Used to get the number of free pages. - -PG_lru|PG_private|PG_swapcache|PG_swapbacked|PG_slab|PG_hwpoision|PG_head_mask ------------------------------------------------------------------------------- - -Page attributes. These flags are used to filter various unnecessary for -dumping pages. - -PAGE_BUDDY_MAPCOUNT_VALUE(~PG_buddy)|PAGE_OFFLINE_MAPCOUNT_VALUE(~PG_offline) ------------------------------------------------------------------------------ - -More page attributes. These flags are used to filter various unnecessary for -dumping pages. - - -HUGETLB_PAGE_DTOR ------------------ - -The HUGETLB_PAGE_DTOR flag denotes hugetlbfs pages. Makedumpfile -excludes these pages. - -x86_64 -====== - -phys_base ---------- - -Used to convert the virtual address of an exported kernel symbol to its -corresponding physical address. - -init_top_pgt ------------- - -Used to walk through the whole page table and convert virtual addresses -to physical addresses. The init_top_pgt is somewhat similar to -swapper_pg_dir, but it is only used in x86_64. - -pgtable_l5_enabled ------------------- - -User-space tools need to know whether the crash kernel was in 5-level -paging mode. - -node_data ---------- - -This is a struct pglist_data array and stores all NUMA nodes -information. Makedumpfile gets the pglist_data structure from it. - -(node_data, MAX_NUMNODES) -------------------------- - -The maximum number of nodes in system. - -KERNELOFFSET ------------- - -The kernel randomization offset. Used to compute the page offset. If -KASLR is disabled, this value is zero. - -KERNEL_IMAGE_SIZE ------------------ - -Currently unused by Makedumpfile. Used to compute the module virtual -address by Crash. - -sme_mask --------- - -AMD-specific with SME support: it indicates the secure memory encryption -mask. Makedumpfile tools need to know whether the crash kernel was -encrypted. If SME is enabled in the first kernel, the crash kernel's -page table entries (pgd/pud/pmd/pte) contain the memory encryption -mask. This is used to remove the SME mask and obtain the true physical -address. - -Currently, sme_mask stores the value of the C-bit position. If needed, -additional SME-relevant info can be placed in that variable. - -For example:: - - [ misc ][ enc bit ][ other misc SME info ] - 0000_0000_0000_0000_1000_0000_0000_0000_0000_0000_..._0000 - 63 59 55 51 47 43 39 35 31 27 ... 3 - -x86_32 -====== - -X86_PAE -------- - -Denotes whether physical address extensions are enabled. It has the cost -of a higher page table lookup overhead, and also consumes more page -table space per process. Used to check whether PAE was enabled in the -crash kernel when converting virtual addresses to physical addresses. - -ia64 -==== - -pgdat_list|(pgdat_list, MAX_NUMNODES) -------------------------------------- - -pg_data_t array storing all NUMA nodes information. MAX_NUMNODES -indicates the number of the nodes. - -node_memblk|(node_memblk, NR_NODE_MEMBLKS) ------------------------------------------- - -List of node memory chunks. Filled when parsing the SRAT table to obtain -information about memory nodes. NR_NODE_MEMBLKS indicates the number of -node memory chunks. - -These values are used to compute the number of nodes the crashed kernel used. - -node_memblk_s|(node_memblk_s, start_paddr)|(node_memblk_s, size) ----------------------------------------------------------------- - -The size of a struct node_memblk_s and the offsets of the -node_memblk_s's members. Used to compute the number of nodes. - -PGTABLE_3|PGTABLE_4 -------------------- - -User-space tools need to know whether the crash kernel was in 3-level or -4-level paging mode. Used to distinguish the page table. - -ARM64 -===== - -VA_BITS -------- - -The maximum number of bits for virtual addresses. Used to compute the -virtual memory ranges. - -kimage_voffset --------------- - -The offset between the kernel virtual and physical mappings. Used to -translate virtual to physical addresses. - -PHYS_OFFSET ------------ - -Indicates the physical address of the start of memory. Similar to -kimage_voffset, which is used to translate virtual to physical -addresses. - -KERNELOFFSET ------------- - -The kernel randomization offset. Used to compute the page offset. If -KASLR is disabled, this value is zero. - -arm -=== - -ARM_LPAE --------- - -It indicates whether the crash kernel supports large physical address -extensions. Used to translate virtual to physical addresses. - -s390 -==== - -lowcore_ptr ------------ - -An array with a pointer to the lowcore of every CPU. Used to print the -psw and all registers information. - -high_memory ------------ - -Used to get the vmalloc_start address from the high_memory symbol. - -(lowcore_ptr, NR_CPUS) ----------------------- - -The maximum number of CPUs. - -powerpc -======= - - -node_data|(node_data, MAX_NUMNODES) ------------------------------------ - -See above. - -contig_page_data ----------------- - -See above. - -vmemmap_list ------------- - -The vmemmap_list maintains the entire vmemmap physical mapping. Used -to get vmemmap list count and populated vmemmap regions info. If the -vmemmap address translation information is stored in the crash kernel, -it is used to translate vmemmap kernel virtual addresses. - -mmu_vmemmap_psize ------------------ - -The size of a page. Used to translate virtual to physical addresses. - -mmu_psize_defs --------------- - -Page size definitions, i.e. 4k, 64k, or 16M. - -Used to make vtop translations. - -vmemmap_backing|(vmemmap_backing, list)|(vmemmap_backing, phys)|(vmemmap_backing, virt_addr) --------------------------------------------------------------------------------------------- - -The vmemmap virtual address space management does not have a traditional -page table to track which virtual struct pages are backed by a physical -mapping. The virtual to physical mappings are tracked in a simple linked -list format. - -User-space tools need to know the offset of list, phys and virt_addr -when computing the count of vmemmap regions. - -mmu_psize_def|(mmu_psize_def, shift) ------------------------------------- - -The size of a struct mmu_psize_def and the offset of mmu_psize_def's -member. - -Used in vtop translations. - -sh -== - -node_data|(node_data, MAX_NUMNODES) ------------------------------------ - -See above. - -X2TLB ------ - -Indicates whether the crashed kernel enabled SH extended mode. diff --git a/Documentation/powerpc/firmware-assisted-dump.txt b/Documentation/powerpc/firmware-assisted-dump.txt index 0c41d6d463f3..10e7f4d16c14 100644 --- a/Documentation/powerpc/firmware-assisted-dump.txt +++ b/Documentation/powerpc/firmware-assisted-dump.txt @@ -59,7 +59,7 @@ as follows: the default calculated size. Use this option if default boot memory size is not sufficient for second kernel to boot successfully. For syntax of crashkernel= parameter, - refer to Documentation/kdump/kdump.rst. If any offset is + refer to Documentation/admin-guide/kdump/kdump.rst. If any offset is provided in crashkernel= parameter, it will be ignored as fadump uses a predefined offset to reserve memory for boot memory dump preservation in case of a crash. diff --git a/Documentation/translations/zh_CN/oops-tracing.txt b/Documentation/translations/zh_CN/oops-tracing.txt index 368ddd05b304..c5f3bda7abcb 100644 --- a/Documentation/translations/zh_CN/oops-tracing.txt +++ b/Documentation/translations/zh_CN/oops-tracing.txt @@ -53,8 +53,8 @@ cat /proc/kmsg > file, 然而你必须介入中止传输, kmsg是一个“ (2)用串口终端启动(请参看Documentation/admin-guide/serial-console.rst),运行一个null modem到另一台机器并用你喜欢的通讯工具获取输出。Minicom工作地很好。 -(3)使用Kdump(请参看Documentation/kdump/kdump.rst), -使用在Documentation/kdump/gdbmacros.txt中定义的dmesg gdb宏,从旧的内存中提取内核 +(3)使用Kdump(请参看Documentation/admin-guide/kdump/kdump.rst), +使用在Documentation/admin-guide/kdump/gdbmacros.txt中定义的dmesg gdb宏,从旧的内存中提取内核 环形缓冲区。 完整信息 diff --git a/MAINTAINERS b/MAINTAINERS index 288f84dbd480..b36028f43192 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8675,7 +8675,7 @@ R: Vivek Goyal L: kexec@lists.infradead.org W: http://lse.sourceforge.net/kdump/ S: Maintained -F: Documentation/kdump/ +F: Documentation/admin-guide/kdump/ KEENE FM RADIO TRANSMITTER DRIVER M: Hans Verkuil diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 6425871e9903..20afd6077465 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -2036,7 +2036,7 @@ config CRASH_DUMP kdump/kexec. The crash dump kernel must be compiled to a memory address not used by the main kernel - For more details see Documentation/kdump/kdump.rst + For more details see Documentation/admin-guide/kdump/kdump.rst config AUTO_ZRELADDR bool "Auto calculation of the decompressed kernel image address" diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a4b22bbf0590..86f81b5afd95 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -996,7 +996,7 @@ config CRASH_DUMP reserved region and then later executed after a crash by kdump/kexec. - For more details see Documentation/kdump/kdump.rst + For more details see Documentation/admin-guide/kdump/kdump.rst config XEN_DOM0 def_bool y diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 31a7d12db705..c2858ac6a46a 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -626,7 +626,7 @@ config CRASH_DUMP to a memory address not used by the main kernel using PHYSICAL_START. - For more details see Documentation/kdump/kdump.rst + For more details see Documentation/admin-guide/kdump/kdump.rst config KEXEC_JUMP bool "kexec jump (EXPERIMENTAL)" diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d0bbca65e4a4..9505066b7ba3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2057,7 +2057,7 @@ config CRASH_DUMP to a memory address not used by the main kernel or BIOS using PHYSICAL_START, or it must be built as a relocatable image (CONFIG_RELOCATABLE=y). - For more details see Documentation/kdump/kdump.rst + For more details see Documentation/admin-guide/kdump/kdump.rst config KEXEC_JUMP bool "kexec jump" @@ -2094,7 +2094,7 @@ config PHYSICAL_START the reserved region. In other words, it can be set based on the "X" value as specified in the "crashkernel=YM@XM" command line boot parameter passed to the panic-ed - kernel. Please take a look at Documentation/kdump/kdump.rst + kernel. Please take a look at Documentation/admin-guide/kdump/kdump.rst for more details about crash dumps. Usage of bzImage for capturing the crash dump is recommended as -- cgit v1.2.3-55-g7522 From 4f4cfa6c560c93ba180c30675cf845e1597de44c Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 27 Jun 2019 14:56:51 -0300 Subject: docs: admin-guide: add a series of orphaned documents There are lots of documents that belong to the admin-guide but are on random places (most under Documentation root dir). Move them to the admin guide. Signed-off-by: Mauro Carvalho Chehab Acked-by: Alexandre Belloni Acked-by: Bartlomiej Zolnierkiewicz --- Documentation/ABI/stable/sysfs-devices-node | 2 +- Documentation/ABI/testing/procfs-diskstats | 2 +- Documentation/ABI/testing/sysfs-block | 2 +- Documentation/ABI/testing/sysfs-devices-system-cpu | 4 +- Documentation/admin-guide/btmrvl.rst | 124 +++++++ Documentation/admin-guide/clearing-warn-once.rst | 9 + Documentation/admin-guide/cpu-load.rst | 114 +++++++ Documentation/admin-guide/cputopology.rst | 177 ++++++++++ .../admin-guide/device-mapper/statistics.rst | 4 +- Documentation/admin-guide/efi-stub.rst | 100 ++++++ Documentation/admin-guide/highuid.rst | 80 +++++ Documentation/admin-guide/hw-vuln/l1tf.rst | 2 +- Documentation/admin-guide/hw_random.rst | 105 ++++++ Documentation/admin-guide/index.rst | 17 + Documentation/admin-guide/iostats.rst | 197 ++++++++++++ Documentation/admin-guide/kernel-parameters.txt | 2 +- .../admin-guide/kernel-per-CPU-kthreads.rst | 356 +++++++++++++++++++++ Documentation/admin-guide/lcd-panel-cgram.rst | 27 ++ Documentation/admin-guide/ldm.rst | 121 +++++++ Documentation/admin-guide/lockup-watchdogs.rst | 83 +++++ Documentation/admin-guide/mm/cma_debugfs.rst | 25 ++ Documentation/admin-guide/mm/index.rst | 1 + Documentation/admin-guide/numastat.rst | 30 ++ Documentation/admin-guide/pnp.rst | 292 +++++++++++++++++ Documentation/admin-guide/rtc.rst | 140 ++++++++ Documentation/admin-guide/svga.rst | 249 ++++++++++++++ Documentation/admin-guide/sysctl/kernel.rst | 2 +- Documentation/admin-guide/video-output.rst | 34 ++ Documentation/auxdisplay/lcd-panel-cgram.rst | 29 -- Documentation/btmrvl.txt | 124 ------- Documentation/clearing-warn-once.txt | 9 - Documentation/cma/debugfs.rst | 27 -- Documentation/cpu-load.txt | 114 ------- Documentation/cputopology.txt | 177 ---------- Documentation/efi-stub.txt | 100 ------ Documentation/fb/vesafb.rst | 2 +- Documentation/highuid.txt | 80 ----- Documentation/hw_random.txt | 105 ------ Documentation/iostats.txt | 197 ------------ Documentation/kernel-per-CPU-kthreads.txt | 356 --------------------- Documentation/ldm.txt | 121 ------- Documentation/lockup-watchdogs.txt | 83 ----- Documentation/numastat.txt | 30 -- Documentation/pnp.txt | 292 ----------------- Documentation/rtc.txt | 140 -------- Documentation/svga.txt | 249 -------------- Documentation/video-output.txt | 34 -- Documentation/x86/topology.rst | 2 +- MAINTAINERS | 12 +- arch/arm/Kconfig | 2 +- arch/parisc/Kconfig | 2 +- arch/sh/Kconfig | 2 +- arch/sparc/Kconfig | 2 +- arch/x86/Kconfig | 4 +- block/partitions/Kconfig | 2 +- drivers/char/Kconfig | 4 +- drivers/char/hw_random/core.c | 2 +- include/linux/hw_random.h | 2 +- 58 files changed, 2310 insertions(+), 2296 deletions(-) create mode 100644 Documentation/admin-guide/btmrvl.rst create mode 100644 Documentation/admin-guide/clearing-warn-once.rst create mode 100644 Documentation/admin-guide/cpu-load.rst create mode 100644 Documentation/admin-guide/cputopology.rst create mode 100644 Documentation/admin-guide/efi-stub.rst create mode 100644 Documentation/admin-guide/highuid.rst create mode 100644 Documentation/admin-guide/hw_random.rst create mode 100644 Documentation/admin-guide/iostats.rst create mode 100644 Documentation/admin-guide/kernel-per-CPU-kthreads.rst create mode 100644 Documentation/admin-guide/lcd-panel-cgram.rst create mode 100644 Documentation/admin-guide/ldm.rst create mode 100644 Documentation/admin-guide/lockup-watchdogs.rst create mode 100644 Documentation/admin-guide/mm/cma_debugfs.rst create mode 100644 Documentation/admin-guide/numastat.rst create mode 100644 Documentation/admin-guide/pnp.rst create mode 100644 Documentation/admin-guide/rtc.rst create mode 100644 Documentation/admin-guide/svga.rst create mode 100644 Documentation/admin-guide/video-output.rst delete mode 100644 Documentation/auxdisplay/lcd-panel-cgram.rst delete mode 100644 Documentation/btmrvl.txt delete mode 100644 Documentation/clearing-warn-once.txt delete mode 100644 Documentation/cma/debugfs.rst delete mode 100644 Documentation/cpu-load.txt delete mode 100644 Documentation/cputopology.txt delete mode 100644 Documentation/efi-stub.txt delete mode 100644 Documentation/highuid.txt delete mode 100644 Documentation/hw_random.txt delete mode 100644 Documentation/iostats.txt delete mode 100644 Documentation/kernel-per-CPU-kthreads.txt delete mode 100644 Documentation/ldm.txt delete mode 100644 Documentation/lockup-watchdogs.txt delete mode 100644 Documentation/numastat.txt delete mode 100644 Documentation/pnp.txt delete mode 100644 Documentation/rtc.txt delete mode 100644 Documentation/svga.txt delete mode 100644 Documentation/video-output.txt (limited to 'arch/x86') diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node index f7ce68fbd4b9..df8413cf1468 100644 --- a/Documentation/ABI/stable/sysfs-devices-node +++ b/Documentation/ABI/stable/sysfs-devices-node @@ -61,7 +61,7 @@ Date: October 2002 Contact: Linux Memory Management list Description: The node's hit/miss statistics, in units of pages. - See Documentation/numastat.txt + See Documentation/admin-guide/numastat.rst What: /sys/devices/system/node/nodeX/distance Date: October 2002 diff --git a/Documentation/ABI/testing/procfs-diskstats b/Documentation/ABI/testing/procfs-diskstats index abac31d216de..2c44b4f1b060 100644 --- a/Documentation/ABI/testing/procfs-diskstats +++ b/Documentation/ABI/testing/procfs-diskstats @@ -29,4 +29,4 @@ Description: 17 - sectors discarded 18 - time spent discarding - For more details refer to Documentation/iostats.txt + For more details refer to Documentation/admin-guide/iostats.rst diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index dfad7427817c..f8c7c7126bb1 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -15,7 +15,7 @@ Description: 9 - I/Os currently in progress 10 - time spent doing I/Os (ms) 11 - weighted time spent doing I/Os (ms) - For more details refer Documentation/iostats.txt + For more details refer Documentation/admin-guide/iostats.rst What: /sys/block///stat diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index d404603c6b52..5f7d7b14fa44 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -34,7 +34,7 @@ Description: CPU topology files that describe kernel limits related to present: cpus that have been identified as being present in the system. - See Documentation/cputopology.txt for more information. + See Documentation/admin-guide/cputopology.rst for more information. What: /sys/devices/system/cpu/probe @@ -103,7 +103,7 @@ Description: CPU topology files that describe a logical CPU's relationship thread_siblings_list: human-readable list of cpu#'s hardware threads within the same core as cpu# - See Documentation/cputopology.txt for more information. + See Documentation/admin-guide/cputopology.rst for more information. What: /sys/devices/system/cpu/cpuidle/current_driver diff --git a/Documentation/admin-guide/btmrvl.rst b/Documentation/admin-guide/btmrvl.rst new file mode 100644 index 000000000000..ec57740ead0c --- /dev/null +++ b/Documentation/admin-guide/btmrvl.rst @@ -0,0 +1,124 @@ +============= +btmrvl driver +============= + +All commands are used via debugfs interface. + +Set/get driver configurations +============================= + +Path: /debug/btmrvl/config/ + +gpiogap=[n], hscfgcmd + These commands are used to configure the host sleep parameters:: + bit 8:0 -- Gap + bit 16:8 -- GPIO + + where GPIO is the pin number of GPIO used to wake up the host. + It could be any valid GPIO pin# (e.g. 0-7) or 0xff (SDIO interface + wakeup will be used instead). + + where Gap is the gap in milli seconds between wakeup signal and + wakeup event, or 0xff for special host sleep setting. + + Usage:: + + # Use SDIO interface to wake up the host and set GAP to 0x80: + echo 0xff80 > /debug/btmrvl/config/gpiogap + echo 1 > /debug/btmrvl/config/hscfgcmd + + # Use GPIO pin #3 to wake up the host and set GAP to 0xff: + echo 0x03ff > /debug/btmrvl/config/gpiogap + echo 1 > /debug/btmrvl/config/hscfgcmd + +psmode=[n], pscmd + These commands are used to enable/disable auto sleep mode + + where the option is:: + + 1 -- Enable auto sleep mode + 0 -- Disable auto sleep mode + + Usage:: + + # Enable auto sleep mode + echo 1 > /debug/btmrvl/config/psmode + echo 1 > /debug/btmrvl/config/pscmd + + # Disable auto sleep mode + echo 0 > /debug/btmrvl/config/psmode + echo 1 > /debug/btmrvl/config/pscmd + + +hsmode=[n], hscmd + These commands are used to enable host sleep or wake up firmware + + where the option is:: + + 1 -- Enable host sleep + 0 -- Wake up firmware + + Usage:: + + # Enable host sleep + echo 1 > /debug/btmrvl/config/hsmode + echo 1 > /debug/btmrvl/config/hscmd + + # Wake up firmware + echo 0 > /debug/btmrvl/config/hsmode + echo 1 > /debug/btmrvl/config/hscmd + + +Get driver status +================= + +Path: /debug/btmrvl/status/ + +Usage:: + + cat /debug/btmrvl/status/ + +where the args are: + +curpsmode + This command displays current auto sleep status. + +psstate + This command display the power save state. + +hsstate + This command display the host sleep state. + +txdnldrdy + This command displays the value of Tx download ready flag. + +Issuing a raw hci command +========================= + +Use hcitool to issue raw hci command, refer to hcitool manual + +Usage:: + + Hcitool cmd [Parameters] + +Interface Control Command:: + + hcitool cmd 0x3f 0x5b 0xf5 0x01 0x00 --Enable All interface + hcitool cmd 0x3f 0x5b 0xf5 0x01 0x01 --Enable Wlan interface + hcitool cmd 0x3f 0x5b 0xf5 0x01 0x02 --Enable BT interface + hcitool cmd 0x3f 0x5b 0xf5 0x00 0x00 --Disable All interface + hcitool cmd 0x3f 0x5b 0xf5 0x00 0x01 --Disable Wlan interface + hcitool cmd 0x3f 0x5b 0xf5 0x00 0x02 --Disable BT interface + +SD8688 firmware +=============== + +Images: + +- /lib/firmware/sd8688_helper.bin +- /lib/firmware/sd8688.bin + + +The images can be downloaded from: + +git.infradead.org/users/dwmw2/linux-firmware.git/libertas/ diff --git a/Documentation/admin-guide/clearing-warn-once.rst b/Documentation/admin-guide/clearing-warn-once.rst new file mode 100644 index 000000000000..211fd926cf00 --- /dev/null +++ b/Documentation/admin-guide/clearing-warn-once.rst @@ -0,0 +1,9 @@ +Clearing WARN_ONCE +------------------ + +WARN_ONCE / WARN_ON_ONCE / printk_once only emit a message once. + +echo 1 > /sys/kernel/debug/clear_warn_once + +clears the state and allows the warnings to print once again. +This can be useful after test suite runs to reproduce problems. diff --git a/Documentation/admin-guide/cpu-load.rst b/Documentation/admin-guide/cpu-load.rst new file mode 100644 index 000000000000..2d01ce43d2a2 --- /dev/null +++ b/Documentation/admin-guide/cpu-load.rst @@ -0,0 +1,114 @@ +======== +CPU load +======== + +Linux exports various bits of information via ``/proc/stat`` and +``/proc/uptime`` that userland tools, such as top(1), use to calculate +the average time system spent in a particular state, for example:: + + $ iostat + Linux 2.6.18.3-exp (linmac) 02/20/2007 + + avg-cpu: %user %nice %system %iowait %steal %idle + 10.01 0.00 2.92 5.44 0.00 81.63 + + ... + +Here the system thinks that over the default sampling period the +system spent 10.01% of the time doing work in user space, 2.92% in the +kernel, and was overall 81.63% of the time idle. + +In most cases the ``/proc/stat`` information reflects the reality quite +closely, however due to the nature of how/when the kernel collects +this data sometimes it can not be trusted at all. + +So how is this information collected? Whenever timer interrupt is +signalled the kernel looks what kind of task was running at this +moment and increments the counter that corresponds to this tasks +kind/state. The problem with this is that the system could have +switched between various states multiple times between two timer +interrupts yet the counter is incremented only for the last state. + + +Example +------- + +If we imagine the system with one task that periodically burns cycles +in the following manner:: + + time line between two timer interrupts + |--------------------------------------| + ^ ^ + |_ something begins working | + |_ something goes to sleep + (only to be awaken quite soon) + +In the above situation the system will be 0% loaded according to the +``/proc/stat`` (since the timer interrupt will always happen when the +system is executing the idle handler), but in reality the load is +closer to 99%. + +One can imagine many more situations where this behavior of the kernel +will lead to quite erratic information inside ``/proc/stat``:: + + + /* gcc -o hog smallhog.c */ + #include + #include + #include + #include + #define HIST 10 + + static volatile sig_atomic_t stop; + + static void sighandler (int signr) + { + (void) signr; + stop = 1; + } + static unsigned long hog (unsigned long niters) + { + stop = 0; + while (!stop && --niters); + return niters; + } + int main (void) + { + int i; + struct itimerval it = { .it_interval = { .tv_sec = 0, .tv_usec = 1 }, + .it_value = { .tv_sec = 0, .tv_usec = 1 } }; + sigset_t set; + unsigned long v[HIST]; + double tmp = 0.0; + unsigned long n; + signal (SIGALRM, &sighandler); + setitimer (ITIMER_REAL, &it, NULL); + + hog (ULONG_MAX); + for (i = 0; i < HIST; ++i) v[i] = ULONG_MAX - hog (ULONG_MAX); + for (i = 0; i < HIST; ++i) tmp += v[i]; + tmp /= HIST; + n = tmp - (tmp / 3.0); + + sigemptyset (&set); + sigaddset (&set, SIGALRM); + + for (;;) { + hog (n); + sigwait (&set, &i); + } + return 0; + } + + +References +---------- + +- http://lkml.org/lkml/2007/2/12/6 +- Documentation/filesystems/proc.txt (1.8) + + +Thanks +------ + +Con Kolivas, Pavel Machek diff --git a/Documentation/admin-guide/cputopology.rst b/Documentation/admin-guide/cputopology.rst new file mode 100644 index 000000000000..b90dafcc8237 --- /dev/null +++ b/Documentation/admin-guide/cputopology.rst @@ -0,0 +1,177 @@ +=========================================== +How CPU topology info is exported via sysfs +=========================================== + +Export CPU topology info via sysfs. Items (attributes) are similar +to /proc/cpuinfo output of some architectures. They reside in +/sys/devices/system/cpu/cpuX/topology/: + +physical_package_id: + + physical package id of cpuX. Typically corresponds to a physical + socket number, but the actual value is architecture and platform + dependent. + +die_id: + + the CPU die ID of cpuX. Typically it is the hardware platform's + identifier (rather than the kernel's). The actual value is + architecture and platform dependent. + +core_id: + + the CPU core ID of cpuX. Typically it is the hardware platform's + identifier (rather than the kernel's). The actual value is + architecture and platform dependent. + +book_id: + + the book ID of cpuX. Typically it is the hardware platform's + identifier (rather than the kernel's). The actual value is + architecture and platform dependent. + +drawer_id: + + the drawer ID of cpuX. Typically it is the hardware platform's + identifier (rather than the kernel's). The actual value is + architecture and platform dependent. + +core_cpus: + + internal kernel map of CPUs within the same core. + (deprecated name: "thread_siblings") + +core_cpus_list: + + human-readable list of CPUs within the same core. + (deprecated name: "thread_siblings_list"); + +package_cpus: + + internal kernel map of the CPUs sharing the same physical_package_id. + (deprecated name: "core_siblings") + +package_cpus_list: + + human-readable list of CPUs sharing the same physical_package_id. + (deprecated name: "core_siblings_list") + +die_cpus: + + internal kernel map of CPUs within the same die. + +die_cpus_list: + + human-readable list of CPUs within the same die. + +book_siblings: + + internal kernel map of cpuX's hardware threads within the same + book_id. + +book_siblings_list: + + human-readable list of cpuX's hardware threads within the same + book_id. + +drawer_siblings: + + internal kernel map of cpuX's hardware threads within the same + drawer_id. + +drawer_siblings_list: + + human-readable list of cpuX's hardware threads within the same + drawer_id. + +Architecture-neutral, drivers/base/topology.c, exports these attributes. +However, the book and drawer related sysfs files will only be created if +CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are selected, respectively. + +CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are currently only used on s390, +where they reflect the cpu and cache hierarchy. + +For an architecture to support this feature, it must define some of +these macros in include/asm-XXX/topology.h:: + + #define topology_physical_package_id(cpu) + #define topology_die_id(cpu) + #define topology_core_id(cpu) + #define topology_book_id(cpu) + #define topology_drawer_id(cpu) + #define topology_sibling_cpumask(cpu) + #define topology_core_cpumask(cpu) + #define topology_die_cpumask(cpu) + #define topology_book_cpumask(cpu) + #define topology_drawer_cpumask(cpu) + +The type of ``**_id macros`` is int. +The type of ``**_cpumask macros`` is ``(const) struct cpumask *``. The latter +correspond with appropriate ``**_siblings`` sysfs attributes (except for +topology_sibling_cpumask() which corresponds with thread_siblings). + +To be consistent on all architectures, include/linux/topology.h +provides default definitions for any of the above macros that are +not defined by include/asm-XXX/topology.h: + +1) topology_physical_package_id: -1 +2) topology_die_id: -1 +3) topology_core_id: 0 +4) topology_sibling_cpumask: just the given CPU +5) topology_core_cpumask: just the given CPU +6) topology_die_cpumask: just the given CPU + +For architectures that don't support books (CONFIG_SCHED_BOOK) there are no +default definitions for topology_book_id() and topology_book_cpumask(). +For architectures that don't support drawers (CONFIG_SCHED_DRAWER) there are +no default definitions for topology_drawer_id() and topology_drawer_cpumask(). + +Additionally, CPU topology information is provided under +/sys/devices/system/cpu and includes these files. The internal +source for the output is in brackets ("[]"). + + =========== ========================================================== + kernel_max: the maximum CPU index allowed by the kernel configuration. + [NR_CPUS-1] + + offline: CPUs that are not online because they have been + HOTPLUGGED off (see cpu-hotplug.txt) or exceed the limit + of CPUs allowed by the kernel configuration (kernel_max + above). [~cpu_online_mask + cpus >= NR_CPUS] + + online: CPUs that are online and being scheduled [cpu_online_mask] + + possible: CPUs that have been allocated resources and can be + brought online if they are present. [cpu_possible_mask] + + present: CPUs that have been identified as being present in the + system. [cpu_present_mask] + =========== ========================================================== + +The format for the above output is compatible with cpulist_parse() +[see ]. Some examples follow. + +In this example, there are 64 CPUs in the system but cpus 32-63 exceed +the kernel max which is limited to 0..31 by the NR_CPUS config option +being 32. Note also that CPUs 2 and 4-31 are not online but could be +brought online as they are both present and possible:: + + kernel_max: 31 + offline: 2,4-31,32-63 + online: 0-1,3 + possible: 0-31 + present: 0-31 + +In this example, the NR_CPUS config option is 128, but the kernel was +started with possible_cpus=144. There are 4 CPUs in the system and cpu2 +was manually taken offline (and is the only CPU that can be brought +online.):: + + kernel_max: 127 + offline: 2,4-127,128-143 + online: 0-1,3 + possible: 0-127 + present: 0-3 + +See cpu-hotplug.txt for the possible_cpus=NUM kernel start parameter +as well as more information on the various cpumasks. diff --git a/Documentation/admin-guide/device-mapper/statistics.rst b/Documentation/admin-guide/device-mapper/statistics.rst index 3d80a9f850cc..41ded0bc5933 100644 --- a/Documentation/admin-guide/device-mapper/statistics.rst +++ b/Documentation/admin-guide/device-mapper/statistics.rst @@ -13,7 +13,7 @@ the range specified. The I/O statistics counters for each step-sized area of a region are in the same format as `/sys/block/*/stat` or `/proc/diskstats` (see: -Documentation/iostats.txt). But two extra counters (12 and 13) are +Documentation/admin-guide/iostats.rst). But two extra counters (12 and 13) are provided: total time spent reading and writing. When the histogram argument is used, the 14th parameter is reported that represents the histogram of latencies. All these counters may be accessed by sending @@ -151,7 +151,7 @@ Messages The first 11 counters have the same meaning as `/sys/block/*/stat or /proc/diskstats`. - Please refer to Documentation/iostats.txt for details. + Please refer to Documentation/admin-guide/iostats.rst for details. 1. the number of reads completed 2. the number of reads merged diff --git a/Documentation/admin-guide/efi-stub.rst b/Documentation/admin-guide/efi-stub.rst new file mode 100644 index 000000000000..833edb0d0bc4 --- /dev/null +++ b/Documentation/admin-guide/efi-stub.rst @@ -0,0 +1,100 @@ +================= +The EFI Boot Stub +================= + +On the x86 and ARM platforms, a kernel zImage/bzImage can masquerade +as a PE/COFF image, thereby convincing EFI firmware loaders to load +it as an EFI executable. The code that modifies the bzImage header, +along with the EFI-specific entry point that the firmware loader +jumps to are collectively known as the "EFI boot stub", and live in +arch/x86/boot/header.S and arch/x86/boot/compressed/eboot.c, +respectively. For ARM the EFI stub is implemented in +arch/arm/boot/compressed/efi-header.S and +arch/arm/boot/compressed/efi-stub.c. EFI stub code that is shared +between architectures is in drivers/firmware/efi/libstub. + +For arm64, there is no compressed kernel support, so the Image itself +masquerades as a PE/COFF image and the EFI stub is linked into the +kernel. The arm64 EFI stub lives in arch/arm64/kernel/efi-entry.S +and drivers/firmware/efi/libstub/arm64-stub.c. + +By using the EFI boot stub it's possible to boot a Linux kernel +without the use of a conventional EFI boot loader, such as grub or +elilo. Since the EFI boot stub performs the jobs of a boot loader, in +a certain sense it *IS* the boot loader. + +The EFI boot stub is enabled with the CONFIG_EFI_STUB kernel option. + + +How to install bzImage.efi +-------------------------- + +The bzImage located in arch/x86/boot/bzImage must be copied to the EFI +System Partition (ESP) and renamed with the extension ".efi". Without +the extension the EFI firmware loader will refuse to execute it. It's +not possible to execute bzImage.efi from the usual Linux file systems +because EFI firmware doesn't have support for them. For ARM the +arch/arm/boot/zImage should be copied to the system partition, and it +may not need to be renamed. Similarly for arm64, arch/arm64/boot/Image +should be copied but not necessarily renamed. + + +Passing kernel parameters from the EFI shell +-------------------------------------------- + +Arguments to the kernel can be passed after bzImage.efi, e.g.:: + + fs0:> bzImage.efi console=ttyS0 root=/dev/sda4 + + +The "initrd=" option +-------------------- + +Like most boot loaders, the EFI stub allows the user to specify +multiple initrd files using the "initrd=" option. This is the only EFI +stub-specific command line parameter, everything else is passed to the +kernel when it boots. + +The path to the initrd file must be an absolute path from the +beginning of the ESP, relative path names do not work. Also, the path +is an EFI-style path and directory elements must be separated with +backslashes (\). For example, given the following directory layout:: + + fs0:> + Kernels\ + bzImage.efi + initrd-large.img + + Ramdisks\ + initrd-small.img + initrd-medium.img + +to boot with the initrd-large.img file if the current working +directory is fs0:\Kernels, the following command must be used:: + + fs0:\Kernels> bzImage.efi initrd=\Kernels\initrd-large.img + +Notice how bzImage.efi can be specified with a relative path. That's +because the image we're executing is interpreted by the EFI shell, +which understands relative paths, whereas the rest of the command line +is passed to bzImage.efi. + + +The "dtb=" option +----------------- + +For the ARM and arm64 architectures, a device tree must be provided to +the kernel. Normally firmware shall supply the device tree via the +EFI CONFIGURATION TABLE. However, the "dtb=" command line option can +be used to override the firmware supplied device tree, or to supply +one when firmware is unable to. + +Please note: Firmware adds runtime configuration information to the +device tree before booting the kernel. If dtb= is used to override +the device tree, then any runtime data provided by firmware will be +lost. The dtb= option should only be used either as a debug tool, or +as a last resort when a device tree is not provided in the EFI +CONFIGURATION TABLE. + +"dtb=" is processed in the same manner as the "initrd=" option that is +described above. diff --git a/Documentation/admin-guide/highuid.rst b/Documentation/admin-guide/highuid.rst new file mode 100644 index 000000000000..6ee70465c0ea --- /dev/null +++ b/Documentation/admin-guide/highuid.rst @@ -0,0 +1,80 @@ +=================================================== +Notes on the change from 16-bit UIDs to 32-bit UIDs +=================================================== + +:Author: Chris Wing +:Last updated: January 11, 2000 + +- kernel code MUST take into account __kernel_uid_t and __kernel_uid32_t + when communicating between user and kernel space in an ioctl or data + structure. + +- kernel code should use uid_t and gid_t in kernel-private structures and + code. + +What's left to be done for 32-bit UIDs on all Linux architectures: + +- Disk quotas have an interesting limitation that is not related to the + maximum UID/GID. They are limited by the maximum file size on the + underlying filesystem, because quota records are written at offsets + corresponding to the UID in question. + Further investigation is needed to see if the quota system can cope + properly with huge UIDs. If it can deal with 64-bit file offsets on all + architectures, this should not be a problem. + +- Decide whether or not to keep backwards compatibility with the system + accounting file, or if we should break it as the comments suggest + (currently, the old 16-bit UID and GID are still written to disk, and + part of the former pad space is used to store separate 32-bit UID and + GID) + +- Need to validate that OS emulation calls the 16-bit UID + compatibility syscalls, if the OS being emulated used 16-bit UIDs, or + uses the 32-bit UID system calls properly otherwise. + + This affects at least: + + - iBCS on Intel + + - sparc32 emulation on sparc64 + (need to support whatever new 32-bit UID system calls are added to + sparc32) + +- Validate that all filesystems behave properly. + + At present, 32-bit UIDs _should_ work for: + + - ext2 + - ufs + - isofs + - nfs + - coda + - udf + + Ioctl() fixups have been made for: + + - ncpfs + - smbfs + + Filesystems with simple fixups to prevent 16-bit UID wraparound: + + - minix + - sysv + - qnx4 + + Other filesystems have not been checked yet. + +- The ncpfs and smpfs filesystems cannot presently use 32-bit UIDs in + all ioctl()s. Some new ioctl()s have been added with 32-bit UIDs, but + more are needed. (as well as new user<->kernel data structures) + +- The ELF core dump format only supports 16-bit UIDs on arm, i386, m68k, + sh, and sparc32. Fixing this is probably not that important, but would + require adding a new ELF section. + +- The ioctl()s used to control the in-kernel NFS server only support + 16-bit UIDs on arm, i386, m68k, sh, and sparc32. + +- make sure that the UID mapping feature of AX25 networking works properly + (it should be safe because it's always used a 32-bit integer to + communicate between user and kernel) diff --git a/Documentation/admin-guide/hw-vuln/l1tf.rst b/Documentation/admin-guide/hw-vuln/l1tf.rst index 656aee262e23..f83212fae4d5 100644 --- a/Documentation/admin-guide/hw-vuln/l1tf.rst +++ b/Documentation/admin-guide/hw-vuln/l1tf.rst @@ -241,7 +241,7 @@ Guest mitigation mechanisms For further information about confining guests to a single or to a group of cores consult the cpusets documentation: - https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.rst + https://www.kernel.org/doc/Documentation/admin-guide/cgroup-v1/cpusets.rst .. _interrupt_isolation: diff --git a/Documentation/admin-guide/hw_random.rst b/Documentation/admin-guide/hw_random.rst new file mode 100644 index 000000000000..121de96e395e --- /dev/null +++ b/Documentation/admin-guide/hw_random.rst @@ -0,0 +1,105 @@ +========================================================== +Linux support for random number generator in i8xx chipsets +========================================================== + +Introduction +============ + +The hw_random framework is software that makes use of a +special hardware feature on your CPU or motherboard, +a Random Number Generator (RNG). The software has two parts: +a core providing the /dev/hwrng character device and its +sysfs support, plus a hardware-specific driver that plugs +into that core. + +To make the most effective use of these mechanisms, you +should download the support software as well. Download the +latest version of the "rng-tools" package from the +hw_random driver's official Web site: + + http://sourceforge.net/projects/gkernel/ + +Those tools use /dev/hwrng to fill the kernel entropy pool, +which is used internally and exported by the /dev/urandom and +/dev/random special files. + +Theory of operation +=================== + +CHARACTER DEVICE. Using the standard open() +and read() system calls, you can read random data from +the hardware RNG device. This data is NOT CHECKED by any +fitness tests, and could potentially be bogus (if the +hardware is faulty or has been tampered with). Data is only +output if the hardware "has-data" flag is set, but nevertheless +a security-conscious person would run fitness tests on the +data before assuming it is truly random. + +The rng-tools package uses such tests in "rngd", and lets you +run them by hand with a "rngtest" utility. + +/dev/hwrng is char device major 10, minor 183. + +CLASS DEVICE. There is a /sys/class/misc/hw_random node with +two unique attributes, "rng_available" and "rng_current". The +"rng_available" attribute lists the hardware-specific drivers +available, while "rng_current" lists the one which is currently +connected to /dev/hwrng. If your system has more than one +RNG available, you may change the one used by writing a name from +the list in "rng_available" into "rng_current". + +========================================================================== + + +Hardware driver for Intel/AMD/VIA Random Number Generators (RNG) + - Copyright 2000,2001 Jeff Garzik + - Copyright 2000,2001 Philipp Rumpf + + +About the Intel RNG hardware, from the firmware hub datasheet +============================================================= + +The Firmware Hub integrates a Random Number Generator (RNG) +using thermal noise generated from inherently random quantum +mechanical properties of silicon. When not generating new random +bits the RNG circuitry will enter a low power state. Intel will +provide a binary software driver to give third party software +access to our RNG for use as a security feature. At this time, +the RNG is only to be used with a system in an OS-present state. + +Intel RNG Driver notes +====================== + +FIXME: support poll(2) + +.. note:: + + request_mem_region was removed, for three reasons: + + 1) Only one RNG is supported by this driver; + 2) The location used by the RNG is a fixed location in + MMIO-addressable memory; + 3) users with properly working BIOS e820 handling will always + have the region in which the RNG is located reserved, so + request_mem_region calls always fail for proper setups. + However, for people who use mem=XX, BIOS e820 information is + **not** in /proc/iomem, and request_mem_region(RNG_ADDR) can + succeed. + +Driver details +============== + +Based on: + Intel 82802AB/82802AC Firmware Hub (FWH) Datasheet + May 1999 Order Number: 290658-002 R + +Intel 82802 Firmware Hub: + Random Number Generator + Programmer's Reference Manual + December 1999 Order Number: 298029-001 R + +Intel 82802 Firmware HUB Random Number Generator Driver + Copyright (c) 2000 Matt Sottek + +Special thanks to Matt Sottek. I did the "guts", he +did the "brains" and all the testing. diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index a5fdb1a846ce..4e98f5596da0 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -85,8 +85,25 @@ configure specific aspects of kernel behavior to your liking. perf-security acpi/index aoe/index + btmrvl + clearing-warn-once + cpu-load + cputopology device-mapper/index + efi-stub + highuid + hw_random + iostats + kernel-per-CPU-kthreads laptops/index + lcd-panel-cgram + ldm + lockup-watchdogs + numastat + pnp + rtc + svga + video-output .. only:: subproject and html diff --git a/Documentation/admin-guide/iostats.rst b/Documentation/admin-guide/iostats.rst new file mode 100644 index 000000000000..5d63b18bd6d1 --- /dev/null +++ b/Documentation/admin-guide/iostats.rst @@ -0,0 +1,197 @@ +===================== +I/O statistics fields +===================== + +Since 2.4.20 (and some versions before, with patches), and 2.5.45, +more extensive disk statistics have been introduced to help measure disk +activity. Tools such as ``sar`` and ``iostat`` typically interpret these and do +the work for you, but in case you are interested in creating your own +tools, the fields are explained here. + +In 2.4 now, the information is found as additional fields in +``/proc/partitions``. In 2.6 and upper, the same information is found in two +places: one is in the file ``/proc/diskstats``, and the other is within +the sysfs file system, which must be mounted in order to obtain +the information. Throughout this document we'll assume that sysfs +is mounted on ``/sys``, although of course it may be mounted anywhere. +Both ``/proc/diskstats`` and sysfs use the same source for the information +and so should not differ. + +Here are examples of these different formats:: + + 2.4: + 3 0 39082680 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 + 3 1 9221278 hda1 35486 0 35496 38030 0 0 0 0 0 38030 38030 + + 2.6+ sysfs: + 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 + 35486 38030 38030 38030 + + 2.6+ diskstats: + 3 0 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 + 3 1 hda1 35486 38030 38030 38030 + + 4.18+ diskstats: + 3 0 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 0 0 0 0 + +On 2.4 you might execute ``grep 'hda ' /proc/partitions``. On 2.6+, you have +a choice of ``cat /sys/block/hda/stat`` or ``grep 'hda ' /proc/diskstats``. + +The advantage of one over the other is that the sysfs choice works well +if you are watching a known, small set of disks. ``/proc/diskstats`` may +be a better choice if you are watching a large number of disks because +you'll avoid the overhead of 50, 100, or 500 or more opens/closes with +each snapshot of your disk statistics. + +In 2.4, the statistics fields are those after the device name. In +the above example, the first field of statistics would be 446216. +By contrast, in 2.6+ if you look at ``/sys/block/hda/stat``, you'll +find just the eleven fields, beginning with 446216. If you look at +``/proc/diskstats``, the eleven fields will be preceded by the major and +minor device numbers, and device name. Each of these formats provides +eleven fields of statistics, each meaning exactly the same things. +All fields except field 9 are cumulative since boot. Field 9 should +go to zero as I/Os complete; all others only increase (unless they +overflow and wrap). Yes, these are (32-bit or 64-bit) unsigned long +(native word size) numbers, and on a very busy or long-lived system they +may wrap. Applications should be prepared to deal with that; unless +your observations are measured in large numbers of minutes or hours, +they should not wrap twice before you notice them. + +Each set of stats only applies to the indicated device; if you want +system-wide stats you'll have to find all the devices and sum them all up. + +Field 1 -- # of reads completed + This is the total number of reads completed successfully. + +Field 2 -- # of reads merged, field 6 -- # of writes merged + Reads and writes which are adjacent to each other may be merged for + efficiency. Thus two 4K reads may become one 8K read before it is + ultimately handed to the disk, and so it will be counted (and queued) + as only one I/O. This field lets you know how often this was done. + +Field 3 -- # of sectors read + This is the total number of sectors read successfully. + +Field 4 -- # of milliseconds spent reading + This is the total number of milliseconds spent by all reads (as + measured from __make_request() to end_that_request_last()). + +Field 5 -- # of writes completed + This is the total number of writes completed successfully. + +Field 6 -- # of writes merged + See the description of field 2. + +Field 7 -- # of sectors written + This is the total number of sectors written successfully. + +Field 8 -- # of milliseconds spent writing + This is the total number of milliseconds spent by all writes (as + measured from __make_request() to end_that_request_last()). + +Field 9 -- # of I/Os currently in progress + The only field that should go to zero. Incremented as requests are + given to appropriate struct request_queue and decremented as they finish. + +Field 10 -- # of milliseconds spent doing I/Os + This field increases so long as field 9 is nonzero. + + Since 5.0 this field counts jiffies when at least one request was + started or completed. If request runs more than 2 jiffies then some + I/O time will not be accounted unless there are other requests. + +Field 11 -- weighted # of milliseconds spent doing I/Os + This field is incremented at each I/O start, I/O completion, I/O + merge, or read of these stats by the number of I/Os in progress + (field 9) times the number of milliseconds spent doing I/O since the + last update of this field. This can provide an easy measure of both + I/O completion time and the backlog that may be accumulating. + +Field 12 -- # of discards completed + This is the total number of discards completed successfully. + +Field 13 -- # of discards merged + See the description of field 2 + +Field 14 -- # of sectors discarded + This is the total number of sectors discarded successfully. + +Field 15 -- # of milliseconds spent discarding + This is the total number of milliseconds spent by all discards (as + measured from __make_request() to end_that_request_last()). + +To avoid introducing performance bottlenecks, no locks are held while +modifying these counters. This implies that minor inaccuracies may be +introduced when changes collide, so (for instance) adding up all the +read I/Os issued per partition should equal those made to the disks ... +but due to the lack of locking it may only be very close. + +In 2.6+, there are counters for each CPU, which make the lack of locking +almost a non-issue. When the statistics are read, the per-CPU counters +are summed (possibly overflowing the unsigned long variable they are +summed to) and the result given to the user. There is no convenient +user interface for accessing the per-CPU counters themselves. + +Disks vs Partitions +------------------- + +There were significant changes between 2.4 and 2.6+ in the I/O subsystem. +As a result, some statistic information disappeared. The translation from +a disk address relative to a partition to the disk address relative to +the host disk happens much earlier. All merges and timings now happen +at the disk level rather than at both the disk and partition level as +in 2.4. Consequently, you'll see a different statistics output on 2.6+ for +partitions from that for disks. There are only *four* fields available +for partitions on 2.6+ machines. This is reflected in the examples above. + +Field 1 -- # of reads issued + This is the total number of reads issued to this partition. + +Field 2 -- # of sectors read + This is the total number of sectors requested to be read from this + partition. + +Field 3 -- # of writes issued + This is the total number of writes issued to this partition. + +Field 4 -- # of sectors written + This is the total number of sectors requested to be written to + this partition. + +Note that since the address is translated to a disk-relative one, and no +record of the partition-relative address is kept, the subsequent success +or failure of the read cannot be attributed to the partition. In other +words, the number of reads for partitions is counted slightly before time +of queuing for partitions, and at completion for whole disks. This is +a subtle distinction that is probably uninteresting for most cases. + +More significant is the error induced by counting the numbers of +reads/writes before merges for partitions and after for disks. Since a +typical workload usually contains a lot of successive and adjacent requests, +the number of reads/writes issued can be several times higher than the +number of reads/writes completed. + +In 2.6.25, the full statistic set is again available for partitions and +disk and partition statistics are consistent again. Since we still don't +keep record of the partition-relative address, an operation is attributed to +the partition which contains the first sector of the request after the +eventual merges. As requests can be merged across partition, this could lead +to some (probably insignificant) inaccuracy. + +Additional notes +---------------- + +In 2.6+, sysfs is not mounted by default. If your distribution of +Linux hasn't added it already, here's the line you'll want to add to +your ``/etc/fstab``:: + + none /sys sysfs defaults 0 0 + + +In 2.6+, all disk statistics were removed from ``/proc/stat``. In 2.4, they +appear in both ``/proc/partitions`` and ``/proc/stat``, although the ones in +``/proc/stat`` take a very different format from those in ``/proc/partitions`` +(see proc(5), if your system has it.) + +-- ricklind@us.ibm.com diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a571a67e0c85..19b1e3bef56c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5066,7 +5066,7 @@ vga= [BOOT,X86-32] Select a particular video mode See Documentation/x86/boot.rst and - Documentation/svga.txt. + Documentation/admin-guide/svga.rst. Use vga=ask for menu. This is actually a boot loader parameter; the value is passed to the kernel using a special protocol. diff --git a/Documentation/admin-guide/kernel-per-CPU-kthreads.rst b/Documentation/admin-guide/kernel-per-CPU-kthreads.rst new file mode 100644 index 000000000000..4f18456dd3b1 --- /dev/null +++ b/Documentation/admin-guide/kernel-per-CPU-kthreads.rst @@ -0,0 +1,356 @@ +========================================== +Reducing OS jitter due to per-cpu kthreads +========================================== + +This document lists per-CPU kthreads in the Linux kernel and presents +options to control their OS jitter. Note that non-per-CPU kthreads are +not listed here. To reduce OS jitter from non-per-CPU kthreads, bind +them to a "housekeeping" CPU dedicated to such work. + +References +========== + +- Documentation/IRQ-affinity.txt: Binding interrupts to sets of CPUs. + +- Documentation/admin-guide/cgroup-v1: Using cgroups to bind tasks to sets of CPUs. + +- man taskset: Using the taskset command to bind tasks to sets + of CPUs. + +- man sched_setaffinity: Using the sched_setaffinity() system + call to bind tasks to sets of CPUs. + +- /sys/devices/system/cpu/cpuN/online: Control CPU N's hotplug state, + writing "0" to offline and "1" to online. + +- In order to locate kernel-generated OS jitter on CPU N: + + cd /sys/kernel/debug/tracing + echo 1 > max_graph_depth # Increase the "1" for more detail + echo function_graph > current_tracer + # run workload + cat per_cpu/cpuN/trace + +kthreads +======== + +Name: + ehca_comp/%u + +Purpose: + Periodically process Infiniband-related work. + +To reduce its OS jitter, do any of the following: + +1. Don't use eHCA Infiniband hardware, instead choosing hardware + that does not require per-CPU kthreads. This will prevent these + kthreads from being created in the first place. (This will + work for most people, as this hardware, though important, is + relatively old and is produced in relatively low unit volumes.) +2. Do all eHCA-Infiniband-related work on other CPUs, including + interrupts. +3. Rework the eHCA driver so that its per-CPU kthreads are + provisioned only on selected CPUs. + + +Name: + irq/%d-%s + +Purpose: + Handle threaded interrupts. + +To reduce its OS jitter, do the following: + +1. Use irq affinity to force the irq threads to execute on + some other CPU. + +Name: + kcmtpd_ctr_%d + +Purpose: + Handle Bluetooth work. + +To reduce its OS jitter, do one of the following: + +1. Don't use Bluetooth, in which case these kthreads won't be + created in the first place. +2. Use irq affinity to force Bluetooth-related interrupts to + occur on some other CPU and furthermore initiate all + Bluetooth activity on some other CPU. + +Name: + ksoftirqd/%u + +Purpose: + Execute softirq handlers when threaded or when under heavy load. + +To reduce its OS jitter, each softirq vector must be handled +separately as follows: + +TIMER_SOFTIRQ +------------- + +Do all of the following: + +1. To the extent possible, keep the CPU out of the kernel when it + is non-idle, for example, by avoiding system calls and by forcing + both kernel threads and interrupts to execute elsewhere. +2. Build with CONFIG_HOTPLUG_CPU=y. After boot completes, force + the CPU offline, then bring it back online. This forces + recurring timers to migrate elsewhere. If you are concerned + with multiple CPUs, force them all offline before bringing the + first one back online. Once you have onlined the CPUs in question, + do not offline any other CPUs, because doing so could force the + timer back onto one of the CPUs in question. + +NET_TX_SOFTIRQ and NET_RX_SOFTIRQ +--------------------------------- + +Do all of the following: + +1. Force networking interrupts onto other CPUs. +2. Initiate any network I/O on other CPUs. +3. Once your application has started, prevent CPU-hotplug operations + from being initiated from tasks that might run on the CPU to + be de-jittered. (It is OK to force this CPU offline and then + bring it back online before you start your application.) + +BLOCK_SOFTIRQ +------------- + +Do all of the following: + +1. Force block-device interrupts onto some other CPU. +2. Initiate any block I/O on other CPUs. +3. Once your application has started, prevent CPU-hotplug operations + from being initiated from tasks that might run on the CPU to + be de-jittered. (It is OK to force this CPU offline and then + bring it back online before you start your application.) + +IRQ_POLL_SOFTIRQ +---------------- + +Do all of the following: + +1. Force block-device interrupts onto some other CPU. +2. Initiate any block I/O and block-I/O polling on other CPUs. +3. Once your application has started, prevent CPU-hotplug operations + from being initiated from tasks that might run on the CPU to + be de-jittered. (It is OK to force this CPU offline and then + bring it back online before you start your application.) + +TASKLET_SOFTIRQ +--------------- + +Do one or more of the following: + +1. Avoid use of drivers that use tasklets. (Such drivers will contain + calls to things like tasklet_schedule().) +2. Convert all drivers that you must use from tasklets to workqueues. +3. Force interrupts for drivers using tasklets onto other CPUs, + and also do I/O involving these drivers on other CPUs. + +SCHED_SOFTIRQ +------------- + +Do all of the following: + +1. Avoid sending scheduler IPIs to the CPU to be de-jittered, + for example, ensure that at most one runnable kthread is present + on that CPU. If a thread that expects to run on the de-jittered + CPU awakens, the scheduler will send an IPI that can result in + a subsequent SCHED_SOFTIRQ. +2. CONFIG_NO_HZ_FULL=y and ensure that the CPU to be de-jittered + is marked as an adaptive-ticks CPU using the "nohz_full=" + boot parameter. This reduces the number of scheduler-clock + interrupts that the de-jittered CPU receives, minimizing its + chances of being selected to do the load balancing work that + runs in SCHED_SOFTIRQ context. +3. To the extent possible, keep the CPU out of the kernel when it + is non-idle, for example, by avoiding system calls and by + forcing both kernel threads and interrupts to execute elsewhere. + This further reduces the number of scheduler-clock interrupts + received by the de-jittered CPU. + +HRTIMER_SOFTIRQ +--------------- + +Do all of the following: + +1. To the extent possible, keep the CPU out of the kernel when it + is non-idle. For example, avoid system calls and force both + kernel threads and interrupts to execute elsewhere. +2. Build with CONFIG_HOTPLUG_CPU=y. Once boot completes, force the + CPU offline, then bring it back online. This forces recurring + timers to migrate elsewhere. If you are concerned with multiple + CPUs, force them all offline before bringing the first one + back online. Once you have onlined the CPUs in question, do not + offline any other CPUs, because doing so could force the timer + back onto one of the CPUs in question. + +RCU_SOFTIRQ +----------- + +Do at least one of the following: + +1. Offload callbacks and keep the CPU in either dyntick-idle or + adaptive-ticks state by doing all of the following: + + a. CONFIG_NO_HZ_FULL=y and ensure that the CPU to be + de-jittered is marked as an adaptive-ticks CPU using the + "nohz_full=" boot parameter. Bind the rcuo kthreads to + housekeeping CPUs, which can tolerate OS jitter. + b. To the extent possible, keep the CPU out of the kernel + when it is non-idle, for example, by avoiding system + calls and by forcing both kernel threads and interrupts + to execute elsewhere. + +2. Enable RCU to do its processing remotely via dyntick-idle by + doing all of the following: + + a. Build with CONFIG_NO_HZ=y and CONFIG_RCU_FAST_NO_HZ=y. + b. Ensure that the CPU goes idle frequently, allowing other + CPUs to detect that it has passed through an RCU quiescent + state. If the kernel is built with CONFIG_NO_HZ_FULL=y, + userspace execution also allows other CPUs to detect that + the CPU in question has passed through a quiescent state. + c. To the extent possible, keep the CPU out of the kernel + when it is non-idle, for example, by avoiding system + calls and by forcing both kernel threads and interrupts + to execute elsewhere. + +Name: + kworker/%u:%d%s (cpu, id, priority) + +Purpose: + Execute workqueue requests + +To reduce its OS jitter, do any of the following: + +1. Run your workload at a real-time priority, which will allow + preempting the kworker daemons. +2. A given workqueue can be made visible in the sysfs filesystem + by passing the WQ_SYSFS to that workqueue's alloc_workqueue(). + Such a workqueue can be confined to a given subset of the + CPUs using the ``/sys/devices/virtual/workqueue/*/cpumask`` sysfs + files. The set of WQ_SYSFS workqueues can be displayed using + "ls sys/devices/virtual/workqueue". That said, the workqueues + maintainer would like to caution people against indiscriminately + sprinkling WQ_SYSFS across all the workqueues. The reason for + caution is that it is easy to add WQ_SYSFS, but because sysfs is + part of the formal user/kernel API, it can be nearly impossible + to remove it, even if its addition was a mistake. +3. Do any of the following needed to avoid jitter that your + application cannot tolerate: + + a. Build your kernel with CONFIG_SLUB=y rather than + CONFIG_SLAB=y, thus avoiding the slab allocator's periodic + use of each CPU's workqueues to run its cache_reap() + function. + b. Avoid using oprofile, thus avoiding OS jitter from + wq_sync_buffer(). + c. Limit your CPU frequency so that a CPU-frequency + governor is not required, possibly enlisting the aid of + special heatsinks or other cooling technologies. If done + correctly, and if you CPU architecture permits, you should + be able to build your kernel with CONFIG_CPU_FREQ=n to + avoid the CPU-frequency governor periodically running + on each CPU, including cs_dbs_timer() and od_dbs_timer(). + + WARNING: Please check your CPU specifications to + make sure that this is safe on your particular system. + d. As of v3.18, Christoph Lameter's on-demand vmstat workers + commit prevents OS jitter due to vmstat_update() on + CONFIG_SMP=y systems. Before v3.18, is not possible + to entirely get rid of the OS jitter, but you can + decrease its frequency by writing a large value to + /proc/sys/vm/stat_interval. The default value is HZ, + for an interval of one second. Of course, larger values + will make your virtual-memory statistics update more + slowly. Of course, you can also run your workload at + a real-time priority, thus preempting vmstat_update(), + but if your workload is CPU-bound, this is a bad idea. + However, there is an RFC patch from Christoph Lameter + (based on an earlier one from Gilad Ben-Yossef) that + reduces or even eliminates vmstat overhead for some + workloads at https://lkml.org/lkml/2013/9/4/379. + e. Boot with "elevator=noop" to avoid workqueue use by + the block layer. + f. If running on high-end powerpc servers, build with + CONFIG_PPC_RTAS_DAEMON=n. This prevents the RTAS + daemon from running on each CPU every second or so. + (This will require editing Kconfig files and will defeat + this platform's RAS functionality.) This avoids jitter + due to the rtas_event_scan() function. + WARNING: Please check your CPU specifications to + make sure that this is safe on your particular system. + g. If running on Cell Processor, build your kernel with + CBE_CPUFREQ_SPU_GOVERNOR=n to avoid OS jitter from + spu_gov_work(). + WARNING: Please check your CPU specifications to + make sure that this is safe on your particular system. + h. If running on PowerMAC, build your kernel with + CONFIG_PMAC_RACKMETER=n to disable the CPU-meter, + avoiding OS jitter from rackmeter_do_timer(). + +Name: + rcuc/%u + +Purpose: + Execute RCU callbacks in CONFIG_RCU_BOOST=y kernels. + +To reduce its OS jitter, do at least one of the following: + +1. Build the kernel with CONFIG_PREEMPT=n. This prevents these + kthreads from being created in the first place, and also obviates + the need for RCU priority boosting. This approach is feasible + for workloads that do not require high degrees of responsiveness. +2. Build the kernel with CONFIG_RCU_BOOST=n. This prevents these + kthreads from being created in the first place. This approach + is feasible only if your workload never requires RCU priority + boosting, for example, if you ensure frequent idle time on all + CPUs that might execute within the kernel. +3. Build with CONFIG_RCU_NOCB_CPU=y and boot with the rcu_nocbs= + boot parameter offloading RCU callbacks from all CPUs susceptible + to OS jitter. This approach prevents the rcuc/%u kthreads from + having any work to do, so that they are never awakened. +4. Ensure that the CPU never enters the kernel, and, in particular, + avoid initiating any CPU hotplug operations on this CPU. This is + another way of preventing any callbacks from being queued on the + CPU, again preventing the rcuc/%u kthreads from having any work + to do. + +Name: + rcuop/%d and rcuos/%d + +Purpose: + Offload RCU callbacks from the corresponding CPU. + +To reduce its OS jitter, do at least one of the following: + +1. Use affinity, cgroups, or other mechanism to force these kthreads + to execute on some other CPU. +2. Build with CONFIG_RCU_NOCB_CPU=n, which will prevent these + kthreads from being created in the first place. However, please + note that this will not eliminate OS jitter, but will instead + shift it to RCU_SOFTIRQ. + +Name: + watchdog/%u + +Purpose: + Detect software lockups on each CPU. + +To reduce its OS jitter, do at least one of the following: + +1. Build with CONFIG_LOCKUP_DETECTOR=n, which will prevent these + kthreads from being created in the first place. +2. Boot with "nosoftlockup=0", which will also prevent these kthreads + from being created. Other related watchdog and softlockup boot + parameters may be found in Documentation/admin-guide/kernel-parameters.rst + and Documentation/watchdog/watchdog-parameters.rst. +3. Echo a zero to /proc/sys/kernel/watchdog to disable the + watchdog timer. +4. Echo a large number of /proc/sys/kernel/watchdog_thresh in + order to reduce the frequency of OS jitter due to the watchdog + timer down to a level that is acceptable for your workload. diff --git a/Documentation/admin-guide/lcd-panel-cgram.rst b/Documentation/admin-guide/lcd-panel-cgram.rst new file mode 100644 index 000000000000..a3eb00c62f53 --- /dev/null +++ b/Documentation/admin-guide/lcd-panel-cgram.rst @@ -0,0 +1,27 @@ +====================================== +Parallel port LCD/Keypad Panel support +====================================== + +Some LCDs allow you to define up to 8 characters, mapped to ASCII +characters 0 to 7. The escape code to define a new character is +'\e[LG' followed by one digit from 0 to 7, representing the character +number, and up to 8 couples of hex digits terminated by a semi-colon +(';'). Each couple of digits represents a line, with 1-bits for each +illuminated pixel with LSB on the right. Lines are numbered from the +top of the character to the bottom. On a 5x7 matrix, only the 5 lower +bits of the 7 first bytes are used for each character. If the string +is incomplete, only complete lines will be redefined. Here are some +examples:: + + printf "\e[LG0010101050D1F0C04;" => 0 = [enter] + printf "\e[LG1040E1F0000000000;" => 1 = [up] + printf "\e[LG2000000001F0E0400;" => 2 = [down] + printf "\e[LG3040E1F001F0E0400;" => 3 = [up-down] + printf "\e[LG40002060E1E0E0602;" => 4 = [left] + printf "\e[LG500080C0E0F0E0C08;" => 5 = [right] + printf "\e[LG60016051516141400;" => 6 = "IP" + + printf "\e[LG00103071F1F070301;" => big speaker + printf "\e[LG00002061E1E060200;" => small speaker + +Willy diff --git a/Documentation/admin-guide/ldm.rst b/Documentation/admin-guide/ldm.rst new file mode 100644 index 000000000000..12c571368e73 --- /dev/null +++ b/Documentation/admin-guide/ldm.rst @@ -0,0 +1,121 @@ +========================================== +LDM - Logical Disk Manager (Dynamic Disks) +========================================== + +:Author: Originally Written by FlatCap - Richard Russon . +:Last Updated: Anton Altaparmakov on 30 March 2007 for Windows Vista. + +Overview +-------- + +Windows 2000, XP, and Vista use a new partitioning scheme. It is a complete +replacement for the MSDOS style partitions. It stores its information in a +1MiB journalled database at the end of the physical disk. The size of +partitions is limited only by disk space. The maximum number of partitions is +nearly 2000. + +Any partitions created under the LDM are called "Dynamic Disks". There are no +longer any primary or extended partitions. Normal MSDOS style partitions are +now known as Basic Disks. + +If you wish to use Spanned, Striped, Mirrored or RAID 5 Volumes, you must use +Dynamic Disks. The journalling allows Windows to make changes to these +partitions and filesystems without the need to reboot. + +Once the LDM driver has divided up the disk, you can use the MD driver to +assemble any multi-partition volumes, e.g. Stripes, RAID5. + +To prevent legacy applications from repartitioning the disk, the LDM creates a +dummy MSDOS partition containing one disk-sized partition. This is what is +supported with the Linux LDM driver. + +A newer approach that has been implemented with Vista is to put LDM on top of a +GPT label disk. This is not supported by the Linux LDM driver yet. + + +Example +------- + +Below we have a 50MiB disk, divided into seven partitions. + +.. note:: + + The missing 1MiB at the end of the disk is where the LDM database is + stored. + ++-------++--------------+---------+-----++--------------+---------+----+ +|Device || Offset Bytes | Sectors | MiB || Size Bytes | Sectors | MiB| ++=======++==============+=========+=====++==============+=========+====+ +|hda || 0 | 0 | 0 || 52428800 | 102400 | 50| ++-------++--------------+---------+-----++--------------+---------+----+ +|hda1 || 51380224 | 100352 | 49 || 1048576 | 2048 | 1| ++-------++--------------+---------+-----++--------------+---------+----+ +|hda2 || 16384 | 32 | 0 || 6979584 | 13632 | 6| ++-------++--------------+---------+-----++--------------+---------+----+ +|hda3 || 6995968 | 13664 | 6 || 10485760 | 20480 | 10| ++-------++--------------+---------+-----++--------------+---------+----+ +|hda4 || 17481728 | 34144 | 16 || 4194304 | 8192 | 4| ++-------++--------------+---------+-----++--------------+---------+----+ +|hda5 || 21676032 | 42336 | 20 || 5242880 | 10240 | 5| ++-------++--------------+---------+-----++--------------+---------+----+ +|hda6 || 26918912 | 52576 | 25 || 10485760 | 20480 | 10| ++-------++--------------+---------+-----++--------------+---------+----+ +|hda7 || 37404672 | 73056 | 35 || 13959168 | 27264 | 13| ++-------++--------------+---------+-----++--------------+---------+----+ + +The LDM Database may not store the partitions in the order that they appear on +disk, but the driver will sort them. + +When Linux boots, you will see something like:: + + hda: 102400 sectors w/32KiB Cache, CHS=50/64/32 + hda: [LDM] hda1 hda2 hda3 hda4 hda5 hda6 hda7 + + +Compiling LDM Support +--------------------- + +To enable LDM, choose the following two options: + + - "Advanced partition selection" CONFIG_PARTITION_ADVANCED + - "Windows Logical Disk Manager (Dynamic Disk) support" CONFIG_LDM_PARTITION + +If you believe the driver isn't working as it should, you can enable the extra +debugging code. This will produce a LOT of output. The option is: + + - "Windows LDM extra logging" CONFIG_LDM_DEBUG + +N.B. The partition code cannot be compiled as a module. + +As with all the partition code, if the driver doesn't see signs of its type of +partition, it will pass control to another driver, so there is no harm in +enabling it. + +If you have Dynamic Disks but don't enable the driver, then all you will see +is a dummy MSDOS partition filling the whole disk. You won't be able to mount +any of the volumes on the disk. + + +Booting +------- + +If you enable LDM support, then lilo is capable of booting from any of the +discovered partitions. However, grub does not understand the LDM partitioning +and cannot boot from a Dynamic Disk. + + +More Documentation +------------------ + +There is an Overview of the LDM together with complete Technical Documentation. +It is available for download. + + http://www.linux-ntfs.org/ + +If you have any LDM questions that aren't answered in the documentation, email +me. + +Cheers, + FlatCap - Richard Russon + ldm@flatcap.org + diff --git a/Documentation/admin-guide/lockup-watchdogs.rst b/Documentation/admin-guide/lockup-watchdogs.rst new file mode 100644 index 000000000000..290840c160af --- /dev/null +++ b/Documentation/admin-guide/lockup-watchdogs.rst @@ -0,0 +1,83 @@ +=============================================================== +Softlockup detector and hardlockup detector (aka nmi_watchdog) +=============================================================== + +The Linux kernel can act as a watchdog to detect both soft and hard +lockups. + +A 'softlockup' is defined as a bug that causes the kernel to loop in +kernel mode for more than 20 seconds (see "Implementation" below for +details), without giving other tasks a chance to run. The current +stack trace is displayed upon detection and, by default, the system +will stay locked up. Alternatively, the kernel can be configured to +panic; a sysctl, "kernel.softlockup_panic", a kernel parameter, +"softlockup_panic" (see "Documentation/admin-guide/kernel-parameters.rst" for +details), and a compile option, "BOOTPARAM_SOFTLOCKUP_PANIC", are +provided for this. + +A 'hardlockup' is defined as a bug that causes the CPU to loop in +kernel mode for more than 10 seconds (see "Implementation" below for +details), without letting other interrupts have a chance to run. +Similarly to the softlockup case, the current stack trace is displayed +upon detection and the system will stay locked up unless the default +behavior is changed, which can be done through a sysctl, +'hardlockup_panic', a compile time knob, "BOOTPARAM_HARDLOCKUP_PANIC", +and a kernel parameter, "nmi_watchdog" +(see "Documentation/admin-guide/kernel-parameters.rst" for details). + +The panic option can be used in combination with panic_timeout (this +timeout is set through the confusingly named "kernel.panic" sysctl), +to cause the system to reboot automatically after a specified amount +of time. + +Implementation +============== + +The soft and hard lockup detectors are built on top of the hrtimer and +perf subsystems, respectively. A direct consequence of this is that, +in principle, they should work in any architecture where these +subsystems are present. + +A periodic hrtimer runs to generate interrupts and kick the watchdog +task. An NMI perf event is generated every "watchdog_thresh" +(compile-time initialized to 10 and configurable through sysctl of the +same name) seconds to check for hardlockups. If any CPU in the system +does not receive any hrtimer interrupt during that time the +'hardlockup detector' (the handler for the NMI perf event) will +generate a kernel warning or call panic, depending on the +configuration. + +The watchdog task is a high priority kernel thread that updates a +timestamp every time it is scheduled. If that timestamp is not updated +for 2*watchdog_thresh seconds (the softlockup threshold) the +'softlockup detector' (coded inside the hrtimer callback function) +will dump useful debug information to the system log, after which it +will call panic if it was instructed to do so or resume execution of +other kernel code. + +The period of the hrtimer is 2*watchdog_thresh/5, which means it has +two or three chances to generate an interrupt before the hardlockup +detector kicks in. + +As explained above, a kernel knob is provided that allows +administrators to configure the period of the hrtimer and the perf +event. The right value for a particular environment is a trade-off +between fast response to lockups and detection overhead. + +By default, the watchdog runs on all online cores. However, on a +kernel configured with NO_HZ_FULL, by default the watchdog runs only +on the housekeeping cores, not the cores specified in the "nohz_full" +boot argument. If we allowed the watchdog to run by default on +the "nohz_full" cores, we would have to run timer ticks to activate +the scheduler, which would prevent the "nohz_full" functionality +from protecting the user code on those cores from the kernel. +Of course, disabling it by default on the nohz_full cores means that +when those cores do enter the kernel, by default we will not be +able to detect if they lock up. However, allowing the watchdog +to continue to run on the housekeeping (non-tickless) cores means +that we will continue to detect lockups properly on those cores. + +In either case, the set of cores excluded from running the watchdog +may be adjusted via the kernel.watchdog_cpumask sysctl. For +nohz_full cores, this may be useful for debugging a case where the +kernel seems to be hanging on the nohz_full cores. diff --git a/Documentation/admin-guide/mm/cma_debugfs.rst b/Documentation/admin-guide/mm/cma_debugfs.rst new file mode 100644 index 000000000000..4e06ffabd78a --- /dev/null +++ b/Documentation/admin-guide/mm/cma_debugfs.rst @@ -0,0 +1,25 @@ +===================== +CMA Debugfs Interface +===================== + +The CMA debugfs interface is useful to retrieve basic information out of the +different CMA areas and to test allocation/release in each of the areas. + +Each CMA zone represents a directory under /cma/, indexed by the +kernel's CMA index. So the first CMA zone would be: + + /cma/cma-0 + +The structure of the files created under that directory is as follows: + + - [RO] base_pfn: The base PFN (Page Frame Number) of the zone. + - [RO] count: Amount of memory in the CMA area. + - [RO] order_per_bit: Order of pages represented by one bit. + - [RO] bitmap: The bitmap of page states in the zone. + - [WO] alloc: Allocate N pages from that CMA area. For example:: + + echo 5 > /cma/cma-2/alloc + +would try to allocate 5 pages from the cma-2 area. + + - [WO] free: Free N pages from that CMA area, similar to the above. diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst index 5f61a6c429e0..11db46448354 100644 --- a/Documentation/admin-guide/mm/index.rst +++ b/Documentation/admin-guide/mm/index.rst @@ -26,6 +26,7 @@ the Linux memory management. :maxdepth: 1 concepts + cma_debugfs hugetlbpage idle_page_tracking ksm diff --git a/Documentation/admin-guide/numastat.rst b/Documentation/admin-guide/numastat.rst new file mode 100644 index 000000000000..aaf1667489f8 --- /dev/null +++ b/Documentation/admin-guide/numastat.rst @@ -0,0 +1,30 @@ +=============================== +Numa policy hit/miss statistics +=============================== + +/sys/devices/system/node/node*/numastat + +All units are pages. Hugepages have separate counters. + +=============== ============================================================ +numa_hit A process wanted to allocate memory from this node, + and succeeded. + +numa_miss A process wanted to allocate memory from another node, + but ended up with memory from this node. + +numa_foreign A process wanted to allocate on this node, + but ended up with memory from another one. + +local_node A process ran on this node and got memory from it. + +other_node A process ran on this node and got memory from another node. + +interleave_hit Interleaving wanted to allocate from this node + and succeeded. +=============== ============================================================ + +For easier reading you can use the numastat utility from the numactl package +(http://oss.sgi.com/projects/libnuma/). Note that it only works +well right now on machines with a small number of CPUs. + diff --git a/Documentation/admin-guide/pnp.rst b/Documentation/admin-guide/pnp.rst new file mode 100644 index 000000000000..bab2d10631f0 --- /dev/null +++ b/Documentation/admin-guide/pnp.rst @@ -0,0 +1,292 @@ +================================= +Linux Plug and Play Documentation +================================= + +:Author: Adam Belay +:Last updated: Oct. 16, 2002 + + +Overview +-------- + +Plug and Play provides a means of detecting and setting resources for legacy or +otherwise unconfigurable devices. The Linux Plug and Play Layer provides these +services to compatible drivers. + + +The User Interface +------------------ + +The Linux Plug and Play user interface provides a means to activate PnP devices +for legacy and user level drivers that do not support Linux Plug and Play. The +user interface is integrated into sysfs. + +In addition to the standard sysfs file the following are created in each +device's directory: +- id - displays a list of support EISA IDs +- options - displays possible resource configurations +- resources - displays currently allocated resources and allows resource changes + +activating a device +^^^^^^^^^^^^^^^^^^^ + +:: + + # echo "auto" > resources + +this will invoke the automatic resource config system to activate the device + +manually activating a device +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:: + + # echo "manual " > resources + + - the configuration number + - static or dynamic + static = for next boot + dynamic = now + +disabling a device +^^^^^^^^^^^^^^^^^^ + +:: + + # echo "disable" > resources + + +EXAMPLE: + +Suppose you need to activate the floppy disk controller. + +1. change to the proper directory, in my case it is + /driver/bus/pnp/devices/00:0f:: + + # cd /driver/bus/pnp/devices/00:0f + # cat name + PC standard floppy disk controller + +2. check if the device is already active:: + + # cat resources + DISABLED + + - Notice the string "DISABLED". This means the device is not active. + +3. check the device's possible configurations (optional):: + + # cat options + Dependent: 01 - Priority acceptable + port 0x3f0-0x3f0, align 0x7, size 0x6, 16-bit address decoding + port 0x3f7-0x3f7, align 0x0, size 0x1, 16-bit address decoding + irq 6 + dma 2 8-bit compatible + Dependent: 02 - Priority acceptable + port 0x370-0x370, align 0x7, size 0x6, 16-bit address decoding + port 0x377-0x377, align 0x0, size 0x1, 16-bit address decoding + irq 6 + dma 2 8-bit compatible + +4. now activate the device:: + + # echo "auto" > resources + +5. finally check if the device is active:: + + # cat resources + io 0x3f0-0x3f5 + io 0x3f7-0x3f7 + irq 6 + dma 2 + +also there are a series of kernel parameters:: + + pnp_reserve_irq=irq1[,irq2] .... + pnp_reserve_dma=dma1[,dma2] .... + pnp_reserve_io=io1,size1[,io2,size2] .... + pnp_reserve_mem=mem1,size1[,mem2,size2] .... + + + +The Unified Plug and Play Layer +------------------------------- + +All Plug and Play drivers, protocols, and services meet at a central location +called the Plug and Play Layer. This layer is responsible for the exchange of +information between PnP drivers and PnP protocols. Thus it automatically +forwards commands to the proper protocol. This makes writing PnP drivers +significantly easier. + +The following functions are available from the Plug and Play Layer: + +pnp_get_protocol + increments the number of uses by one + +pnp_put_protocol + deincrements the number of uses by one + +pnp_register_protocol + use this to register a new PnP protocol + +pnp_unregister_protocol + use this function to remove a PnP protocol from the Plug and Play Layer + +pnp_register_driver + adds a PnP driver to the Plug and Play Layer + + this includes driver model integration + returns zero for success or a negative error number for failure; count + calls to the .add() method if you need to know how many devices bind to + the driver + +pnp_unregister_driver + removes a PnP driver from the Plug and Play Layer + + + +Plug and Play Protocols +----------------------- + +This section contains information for PnP protocol developers. + +The following Protocols are currently available in the computing world: + +- PNPBIOS: + used for system devices such as serial and parallel ports. +- ISAPNP: + provides PnP support for the ISA bus +- ACPI: + among its many uses, ACPI provides information about system level + devices. + +It is meant to replace the PNPBIOS. It is not currently supported by Linux +Plug and Play but it is planned to be in the near future. + + +Requirements for a Linux PnP protocol: +1. the protocol must use EISA IDs +2. the protocol must inform the PnP Layer of a device's current configuration + +- the ability to set resources is optional but preferred. + +The following are PnP protocol related functions: + +pnp_add_device + use this function to add a PnP device to the PnP layer + + only call this function when all wanted values are set in the pnp_dev + structure + +pnp_init_device + call this to initialize the PnP structure + +pnp_remove_device + call this to remove a device from the Plug and Play Layer. + it will fail if the device is still in use. + automatically will free mem used by the device and related structures + +pnp_add_id + adds an EISA ID to the list of supported IDs for the specified device + +For more information consult the source of a protocol such as +/drivers/pnp/pnpbios/core.c. + + + +Linux Plug and Play Drivers +--------------------------- + +This section contains information for Linux PnP driver developers. + +The New Way +^^^^^^^^^^^ + +1. first make a list of supported EISA IDS + + ex:: + + static const struct pnp_id pnp_dev_table[] = { + /* Standard LPT Printer Port */ + {.id = "PNP0400", .driver_data = 0}, + /* ECP Printer Port */ + {.id = "PNP0401", .driver_data = 0}, + {.id = ""} + }; + + Please note that the character 'X' can be used as a wild card in the function + portion (last four characters). + + ex:: + + /* Unknown PnP modems */ + { "PNPCXXX", UNKNOWN_DEV }, + + Supported PnP card IDs can optionally be defined. + ex:: + + static const struct pnp_id pnp_card_table[] = { + { "ANYDEVS", 0 }, + { "", 0 } + }; + +2. Optionally define probe and remove functions. It may make sense not to + define these functions if the driver already has a reliable method of detecting + the resources, such as the parport_pc driver. + + ex:: + + static int + serial_pnp_probe(struct pnp_dev * dev, const struct pnp_id *card_id, const + struct pnp_id *dev_id) + { + . . . + + ex:: + + static void serial_pnp_remove(struct pnp_dev * dev) + { + . . . + + consult /drivers/serial/8250_pnp.c for more information. + +3. create a driver structure + + ex:: + + static struct pnp_driver serial_pnp_driver = { + .name = "serial", + .card_id_table = pnp_card_table, + .id_table = pnp_dev_table, + .probe = serial_pnp_probe, + .remove = serial_pnp_remove, + }; + + * name and id_table cannot be NULL. + +4. register the driver + + ex:: + + static int __init serial8250_pnp_init(void) + { + return pnp_register_driver(&serial_pnp_driver); + } + +The Old Way +^^^^^^^^^^^ + +A series of compatibility functions have been created to make it easy to convert +ISAPNP drivers. They should serve as a temporary solution only. + +They are as follows:: + + struct pnp_card *pnp_find_card(unsigned short vendor, + unsigned short device, + struct pnp_card *from) + + struct pnp_dev *pnp_find_dev(struct pnp_card *card, + unsigned short vendor, + unsigned short function, + struct pnp_dev *from) + diff --git a/Documentation/admin-guide/rtc.rst b/Documentation/admin-guide/rtc.rst new file mode 100644 index 000000000000..688c95b11919 --- /dev/null +++ b/Documentation/admin-guide/rtc.rst @@ -0,0 +1,140 @@ +======================================= +Real Time Clock (RTC) Drivers for Linux +======================================= + +When Linux developers talk about a "Real Time Clock", they usually mean +something that tracks wall clock time and is battery backed so that it +works even with system power off. Such clocks will normally not track +the local time zone or daylight savings time -- unless they dual boot +with MS-Windows -- but will instead be set to Coordinated Universal Time +(UTC, formerly "Greenwich Mean Time"). + +The newest non-PC hardware tends to just count seconds, like the time(2) +system call reports, but RTCs also very commonly represent time using +the Gregorian calendar and 24 hour time, as reported by gmtime(3). + +Linux has two largely-compatible userspace RTC API families you may +need to know about: + + * /dev/rtc ... is the RTC provided by PC compatible systems, + so it's not very portable to non-x86 systems. + + * /dev/rtc0, /dev/rtc1 ... are part of a framework that's + supported by a wide variety of RTC chips on all systems. + +Programmers need to understand that the PC/AT functionality is not +always available, and some systems can do much more. That is, the +RTCs use the same API to make requests in both RTC frameworks (using +different filenames of course), but the hardware may not offer the +same functionality. For example, not every RTC is hooked up to an +IRQ, so they can't all issue alarms; and where standard PC RTCs can +only issue an alarm up to 24 hours in the future, other hardware may +be able to schedule one any time in the upcoming century. + + +Old PC/AT-Compatible driver: /dev/rtc +-------------------------------------- + +All PCs (even Alpha machines) have a Real Time Clock built into them. +Usually they are built into the chipset of the computer, but some may +actually have a Motorola MC146818 (or clone) on the board. This is the +clock that keeps the date and time while your computer is turned off. + +ACPI has standardized that MC146818 functionality, and extended it in +a few ways (enabling longer alarm periods, and wake-from-hibernate). +That functionality is NOT exposed in the old driver. + +However it can also be used to generate signals from a slow 2Hz to a +relatively fast 8192Hz, in increments of powers of two. These signals +are reported by interrupt number 8. (Oh! So *that* is what IRQ 8 is +for...) It can also function as a 24hr alarm, raising IRQ 8 when the +alarm goes off. The alarm can also be programmed to only check any +subset of the three programmable values, meaning that it could be set to +ring on the 30th second of the 30th minute of every hour, for example. +The clock can also be set to generate an interrupt upon every clock +update, thus generating a 1Hz signal. + +The interrupts are reported via /dev/rtc (major 10, minor 135, read only +character device) in the form of an unsigned long. The low byte contains +the type of interrupt (update-done, alarm-rang, or periodic) that was +raised, and the remaining bytes contain the number of interrupts since +the last read. Status information is reported through the pseudo-file +/proc/driver/rtc if the /proc filesystem was enabled. The driver has +built in locking so that only one process is allowed to have the /dev/rtc +interface open at a time. + +A user process can monitor these interrupts by doing a read(2) or a +select(2) on /dev/rtc -- either will block/stop the user process until +the next interrupt is received. This is useful for things like +reasonably high frequency data acquisition where one doesn't want to +burn up 100% CPU by polling gettimeofday etc. etc. + +At high frequencies, or under high loads, the user process should check +the number of interrupts received since the last read to determine if +there has been any interrupt "pileup" so to speak. Just for reference, a +typical 486-33 running a tight read loop on /dev/rtc will start to suffer +occasional interrupt pileup (i.e. > 1 IRQ event since last read) for +frequencies above 1024Hz. So you really should check the high bytes +of the value you read, especially at frequencies above that of the +normal timer interrupt, which is 100Hz. + +Programming and/or enabling interrupt frequencies greater than 64Hz is +only allowed by root. This is perhaps a bit conservative, but we don't want +an evil user generating lots of IRQs on a slow 386sx-16, where it might have +a negative impact on performance. This 64Hz limit can be changed by writing +a different value to /proc/sys/dev/rtc/max-user-freq. Note that the +interrupt handler is only a few lines of code to minimize any possibility +of this effect. + +Also, if the kernel time is synchronized with an external source, the +kernel will write the time back to the CMOS clock every 11 minutes. In +the process of doing this, the kernel briefly turns off RTC periodic +interrupts, so be aware of this if you are doing serious work. If you +don't synchronize the kernel time with an external source (via ntp or +whatever) then the kernel will keep its hands off the RTC, allowing you +exclusive access to the device for your applications. + +The alarm and/or interrupt frequency are programmed into the RTC via +various ioctl(2) calls as listed in ./include/linux/rtc.h +Rather than write 50 pages describing the ioctl() and so on, it is +perhaps more useful to include a small test program that demonstrates +how to use them, and demonstrates the features of the driver. This is +probably a lot more useful to people interested in writing applications +that will be using this driver. See the code at the end of this document. + +(The original /dev/rtc driver was written by Paul Gortmaker.) + + +New portable "RTC Class" drivers: /dev/rtcN +-------------------------------------------- + +Because Linux supports many non-ACPI and non-PC platforms, some of which +have more than one RTC style clock, it needed a more portable solution +than expecting a single battery-backed MC146818 clone on every system. +Accordingly, a new "RTC Class" framework has been defined. It offers +three different userspace interfaces: + + * /dev/rtcN ... much the same as the older /dev/rtc interface + + * /sys/class/rtc/rtcN ... sysfs attributes support readonly + access to some RTC attributes. + + * /proc/driver/rtc ... the system clock RTC may expose itself + using a procfs interface. If there is no RTC for the system clock, + rtc0 is used by default. More information is (currently) shown + here than through sysfs. + +The RTC Class framework supports a wide variety of RTCs, ranging from those +integrated into embeddable system-on-chip (SOC) processors to discrete chips +using I2C, SPI, or some other bus to communicate with the host CPU. There's +even support for PC-style RTCs ... including the features exposed on newer PCs +through ACPI. + +The new framework also removes the "one RTC per system" restriction. For +example, maybe the low-power battery-backed RTC is a discrete I2C chip, but +a high functionality RTC is integrated into the SOC. That system might read +the system clock from the discrete RTC, but use the integrated one for all +other tasks, because of its greater functionality. + +Check out tools/testing/selftests/rtc/rtctest.c for an example usage of the +ioctl interface. diff --git a/Documentation/admin-guide/svga.rst b/Documentation/admin-guide/svga.rst new file mode 100644 index 000000000000..b6c2f9acca92 --- /dev/null +++ b/Documentation/admin-guide/svga.rst @@ -0,0 +1,249 @@ +.. include:: + +================================= +Video Mode Selection Support 2.13 +================================= + +:Copyright: |copy| 1995--1999 Martin Mares, + +Intro +~~~~~ + +This small document describes the "Video Mode Selection" feature which +allows the use of various special video modes supported by the video BIOS. Due +to usage of the BIOS, the selection is limited to boot time (before the +kernel decompression starts) and works only on 80X86 machines. + +.. note:: + + Short intro for the impatient: Just use vga=ask for the first time, + enter ``scan`` on the video mode prompt, pick the mode you want to use, + remember its mode ID (the four-digit hexadecimal number) and then + set the vga parameter to this number (converted to decimal first). + +The video mode to be used is selected by a kernel parameter which can be +specified in the kernel Makefile (the SVGA_MODE=... line) or by the "vga=..." +option of LILO (or some other boot loader you use) or by the "vidmode" utility +(present in standard Linux utility packages). You can use the following values +of this parameter:: + + NORMAL_VGA - Standard 80x25 mode available on all display adapters. + + EXTENDED_VGA - Standard 8-pixel font mode: 80x43 on EGA, 80x50 on VGA. + + ASK_VGA - Display a video mode menu upon startup (see below). + + 0..35 - Menu item number (when you have used the menu to view the list of + modes available on your adapter, you can specify the menu item you want + to use). 0..9 correspond to "0".."9", 10..35 to "a".."z". Warning: the + mode list displayed may vary as the kernel version changes, because the + modes are listed in a "first detected -- first displayed" manner. It's + better to use absolute mode numbers instead. + + 0x.... - Hexadecimal video mode ID (also displayed on the menu, see below + for exact meaning of the ID). Warning: rdev and LILO don't support + hexadecimal numbers -- you have to convert it to decimal manually. + +Menu +~~~~ + +The ASK_VGA mode causes the kernel to offer a video mode menu upon +bootup. It displays a "Press to see video modes available, +to continue or wait 30 secs" message. If you press , you enter the +menu, if you press or wait 30 seconds, the kernel will boot up in +the standard 80x25 mode. + +The menu looks like:: + + Video adapter: + Mode: COLSxROWS: + 0 0F00 80x25 + 1 0F01 80x50 + 2 0F02 80x43 + 3 0F03 80x26 + .... + Enter mode number or ``scan``: + + tells what video adapter did Linux detect +-- it's either a generic adapter name (MDA, CGA, HGC, EGA, VGA, VESA VGA [a VGA +with VESA-compliant BIOS]) or a chipset name (e.g., Trident). Direct detection +of chipsets is turned off by default as it's inherently unreliable due to +absolutely insane PC design. + +"0 0F00 80x25" means that the first menu item (the menu items are numbered +from "0" to "9" and from "a" to "z") is a 80x25 mode with ID=0x0f00 (see the +next section for a description of mode IDs). + + encourages you to enter the item number or mode ID +you wish to set and press . If the computer complains something about +"Unknown mode ID", it is trying to tell you that it isn't possible to set such +a mode. It's also possible to press only which leaves the current mode. + +The mode list usually contains a few basic modes and some VESA modes. In +case your chipset has been detected, some chipset-specific modes are shown as +well (some of these might be missing or unusable on your machine as different +BIOSes are often shipped with the same card and the mode numbers depend purely +on the VGA BIOS). + +The modes displayed on the menu are partially sorted: The list starts with +the standard modes (80x25 and 80x50) followed by "special" modes (80x28 and +80x43), local modes (if the local modes feature is enabled), VESA modes and +finally SVGA modes for the auto-detected adapter. + +If you are not happy with the mode list offered (e.g., if you think your card +is able to do more), you can enter "scan" instead of item number / mode ID. The +program will try to ask the BIOS for all possible video mode numbers and test +what happens then. The screen will be probably flashing wildly for some time and +strange noises will be heard from inside the monitor and so on and then, really +all consistent video modes supported by your BIOS will appear (plus maybe some +``ghost modes``). If you are afraid this could damage your monitor, don't use +this function. + +After scanning, the mode ordering is a bit different: the auto-detected SVGA +modes are not listed at all and the modes revealed by ``scan`` are shown before +all VESA modes. + +Mode IDs +~~~~~~~~ + +Because of the complexity of all the video stuff, the video mode IDs +used here are also a bit complex. A video mode ID is a 16-bit number usually +expressed in a hexadecimal notation (starting with "0x"). You can set a mode +by entering its mode directly if you know it even if it isn't shown on the menu. + +The ID numbers can be divided to those regions:: + + 0x0000 to 0x00ff - menu item references. 0x0000 is the first item. Don't use + outside the menu as this can change from boot to boot (especially if you + have used the ``scan`` feature). + + 0x0100 to 0x017f - standard BIOS modes. The ID is a BIOS video mode number + (as presented to INT 10, function 00) increased by 0x0100. + + 0x0200 to 0x08ff - VESA BIOS modes. The ID is a VESA mode ID increased by + 0x0100. All VESA modes should be autodetected and shown on the menu. + + 0x0900 to 0x09ff - Video7 special modes. Set by calling INT 0x10, AX=0x6f05. + (Usually 940=80x43, 941=132x25, 942=132x44, 943=80x60, 944=100x60, + 945=132x28 for the standard Video7 BIOS) + + 0x0f00 to 0x0fff - special modes (they are set by various tricks -- usually + by modifying one of the standard modes). Currently available: + 0x0f00 standard 80x25, don't reset mode if already set (=FFFF) + 0x0f01 standard with 8-point font: 80x43 on EGA, 80x50 on VGA + 0x0f02 VGA 80x43 (VGA switched to 350 scanlines with a 8-point font) + 0x0f03 VGA 80x28 (standard VGA scans, but 14-point font) + 0x0f04 leave current video mode + 0x0f05 VGA 80x30 (480 scans, 16-point font) + 0x0f06 VGA 80x34 (480 scans, 14-point font) + 0x0f07 VGA 80x60 (480 scans, 8-point font) + 0x0f08 Graphics hack (see the VIDEO_GFX_HACK paragraph below) + + 0x1000 to 0x7fff - modes specified by resolution. The code has a "0xRRCC" + form where RR is a number of rows and CC is a number of columns. + E.g., 0x1950 corresponds to a 80x25 mode, 0x2b84 to 132x43 etc. + This is the only fully portable way to refer to a non-standard mode, + but it relies on the mode being found and displayed on the menu + (remember that mode scanning is not done automatically). + + 0xff00 to 0xffff - aliases for backward compatibility: + 0xffff equivalent to 0x0f00 (standard 80x25) + 0xfffe equivalent to 0x0f01 (EGA 80x43 or VGA 80x50) + +If you add 0x8000 to the mode ID, the program will try to recalculate +vertical display timing according to mode parameters, which can be used to +eliminate some annoying bugs of certain VGA BIOSes (usually those used for +cards with S3 chipsets and old Cirrus Logic BIOSes) -- mainly extra lines at the +end of the display. + +Options +~~~~~~~ + +Build options for arch/x86/boot/* are selected by the kernel kconfig +utility and the kernel .config file. + +VIDEO_GFX_HACK - includes special hack for setting of graphics modes +to be used later by special drivers. +Allows to set _any_ BIOS mode including graphic ones and forcing specific +text screen resolution instead of peeking it from BIOS variables. Don't use +unless you think you know what you're doing. To activate this setup, use +mode number 0x0f08 (see the Mode IDs section above). + +Still doesn't work? +~~~~~~~~~~~~~~~~~~~ + +When the mode detection doesn't work (e.g., the mode list is incorrect or +the machine hangs instead of displaying the menu), try to switch off some of +the configuration options listed under "Options". If it fails, you can still use +your kernel with the video mode set directly via the kernel parameter. + +In either case, please send me a bug report containing what _exactly_ +happens and how do the configuration switches affect the behaviour of the bug. + +If you start Linux from M$-DOS, you might also use some DOS tools for +video mode setting. In this case, you must specify the 0x0f04 mode ("leave +current settings") to Linux, because if you don't and you use any non-standard +mode, Linux will switch to 80x25 automatically. + +If you set some extended mode and there's one or more extra lines on the +bottom of the display containing already scrolled-out text, your VGA BIOS +contains the most common video BIOS bug called "incorrect vertical display +end setting". Adding 0x8000 to the mode ID might fix the problem. Unfortunately, +this must be done manually -- no autodetection mechanisms are available. + +History +~~~~~~~ + +=============== ================================================================ +1.0 (??-Nov-95) First version supporting all adapters supported by the old + setup.S + Cirrus Logic 54XX. Present in some 1.3.4? kernels + and then removed due to instability on some machines. +2.0 (28-Jan-96) Rewritten from scratch. Cirrus Logic 64XX support added, almost + everything is configurable, the VESA support should be much more + stable, explicit mode numbering allowed, "scan" implemented etc. +2.1 (30-Jan-96) VESA modes moved to 0x200-0x3ff. Mode selection by resolution + supported. Few bugs fixed. VESA modes are listed prior to + modes supplied by SVGA autodetection as they are more reliable. + CLGD autodetect works better. Doesn't depend on 80x25 being + active when started. Scanning fixed. 80x43 (any VGA) added. + Code cleaned up. +2.2 (01-Feb-96) EGA 80x43 fixed. VESA extended to 0x200-0x4ff (non-standard 02XX + VESA modes work now). Display end bug workaround supported. + Special modes renumbered to allow adding of the "recalculate" + flag, 0xffff and 0xfffe became aliases instead of real IDs. + Screen contents retained during mode changes. +2.3 (15-Mar-96) Changed to work with 1.3.74 kernel. +2.4 (18-Mar-96) Added patches by Hans Lermen fixing a memory overwrite problem + with some boot loaders. Memory management rewritten to reflect + these changes. Unfortunately, screen contents retaining works + only with some loaders now. + Added a Tseng 132x60 mode. +2.5 (19-Mar-96) Fixed a VESA mode scanning bug introduced in 2.4. +2.6 (25-Mar-96) Some VESA BIOS errors not reported -- it fixes error reports on + several cards with broken VESA code (e.g., ATI VGA). +2.7 (09-Apr-96) - Accepted all VESA modes in range 0x100 to 0x7ff, because some + cards use very strange mode numbers. + - Added Realtek VGA modes (thanks to Gonzalo Tornaria). + - Hardware testing order slightly changed, tests based on ROM + contents done as first. + - Added support for special Video7 mode switching functions + (thanks to Tom Vander Aa). + - Added 480-scanline modes (especially useful for notebooks, + original version written by hhanemaa@cs.ruu.nl, patched by + Jeff Chua, rewritten by me). + - Screen store/restore fixed. +2.8 (14-Apr-96) - Previous release was not compilable without CONFIG_VIDEO_SVGA. + - Better recognition of text modes during mode scan. +2.9 (12-May-96) - Ignored VESA modes 0x80 - 0xff (more VESA BIOS bugs!) +2.10(11-Nov-96) - The whole thing made optional. + - Added the CONFIG_VIDEO_400_HACK switch. + - Added the CONFIG_VIDEO_GFX_HACK switch. + - Code cleanup. +2.11(03-May-97) - Yet another cleanup, now including also the documentation. + - Direct testing of SVGA adapters turned off by default, ``scan`` + offered explicitly on the prompt line. + - Removed the doc section describing adding of new probing + functions as I try to get rid of _all_ hardware probing here. +2.12(25-May-98) Added support for VESA frame buffer graphics. +2.13(14-May-99) Minor documentation fixes. +=============== ================================================================ diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index a0c1d4ce403a..032c7cd3cede 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -327,7 +327,7 @@ when a hard lockup is detected. 0 - don't panic on hard lockup 1 - panic on hard lockup -See Documentation/lockup-watchdogs.txt for more information. This can +See Documentation/admin-guide/lockup-watchdogs.rst for more information. This can also be set using the nmi_watchdog kernel parameter. diff --git a/Documentation/admin-guide/video-output.rst b/Documentation/admin-guide/video-output.rst new file mode 100644 index 000000000000..56d6fa2e2368 --- /dev/null +++ b/Documentation/admin-guide/video-output.rst @@ -0,0 +1,34 @@ +Video Output Switcher Control +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +2006 luming.yu@intel.com + +The output sysfs class driver provides an abstract video output layer that +can be used to hook platform specific methods to enable/disable video output +device through common sysfs interface. For example, on my IBM ThinkPad T42 +laptop, The ACPI video driver registered its output devices and read/write +method for 'state' with output sysfs class. The user interface under sysfs is:: + + linux:/sys/class/video_output # tree . + . + |-- CRT0 + | |-- device -> ../../../devices/pci0000:00/0000:00:01.0 + | |-- state + | |-- subsystem -> ../../../class/video_output + | `-- uevent + |-- DVI0 + | |-- device -> ../../../devices/pci0000:00/0000:00:01.0 + | |-- state + | |-- subsystem -> ../../../class/video_output + | `-- uevent + |-- LCD0 + | |-- device -> ../../../devices/pci0000:00/0000:00:01.0 + | |-- state + | |-- subsystem -> ../../../class/video_output + | `-- uevent + `-- TV0 + |-- device -> ../../../devices/pci0000:00/0000:00:01.0 + |-- state + |-- subsystem -> ../../../class/video_output + `-- uevent + diff --git a/Documentation/auxdisplay/lcd-panel-cgram.rst b/Documentation/auxdisplay/lcd-panel-cgram.rst deleted file mode 100644 index dfef50286018..000000000000 --- a/Documentation/auxdisplay/lcd-panel-cgram.rst +++ /dev/null @@ -1,29 +0,0 @@ -:orphan: - -====================================== -Parallel port LCD/Keypad Panel support -====================================== - -Some LCDs allow you to define up to 8 characters, mapped to ASCII -characters 0 to 7. The escape code to define a new character is -'\e[LG' followed by one digit from 0 to 7, representing the character -number, and up to 8 couples of hex digits terminated by a semi-colon -(';'). Each couple of digits represents a line, with 1-bits for each -illuminated pixel with LSB on the right. Lines are numbered from the -top of the character to the bottom. On a 5x7 matrix, only the 5 lower -bits of the 7 first bytes are used for each character. If the string -is incomplete, only complete lines will be redefined. Here are some -examples:: - - printf "\e[LG0010101050D1F0C04;" => 0 = [enter] - printf "\e[LG1040E1F0000000000;" => 1 = [up] - printf "\e[LG2000000001F0E0400;" => 2 = [down] - printf "\e[LG3040E1F001F0E0400;" => 3 = [up-down] - printf "\e[LG40002060E1E0E0602;" => 4 = [left] - printf "\e[LG500080C0E0F0E0C08;" => 5 = [right] - printf "\e[LG60016051516141400;" => 6 = "IP" - - printf "\e[LG00103071F1F070301;" => big speaker - printf "\e[LG00002061E1E060200;" => small speaker - -Willy diff --git a/Documentation/btmrvl.txt b/Documentation/btmrvl.txt deleted file mode 100644 index ec57740ead0c..000000000000 --- a/Documentation/btmrvl.txt +++ /dev/null @@ -1,124 +0,0 @@ -============= -btmrvl driver -============= - -All commands are used via debugfs interface. - -Set/get driver configurations -============================= - -Path: /debug/btmrvl/config/ - -gpiogap=[n], hscfgcmd - These commands are used to configure the host sleep parameters:: - bit 8:0 -- Gap - bit 16:8 -- GPIO - - where GPIO is the pin number of GPIO used to wake up the host. - It could be any valid GPIO pin# (e.g. 0-7) or 0xff (SDIO interface - wakeup will be used instead). - - where Gap is the gap in milli seconds between wakeup signal and - wakeup event, or 0xff for special host sleep setting. - - Usage:: - - # Use SDIO interface to wake up the host and set GAP to 0x80: - echo 0xff80 > /debug/btmrvl/config/gpiogap - echo 1 > /debug/btmrvl/config/hscfgcmd - - # Use GPIO pin #3 to wake up the host and set GAP to 0xff: - echo 0x03ff > /debug/btmrvl/config/gpiogap - echo 1 > /debug/btmrvl/config/hscfgcmd - -psmode=[n], pscmd - These commands are used to enable/disable auto sleep mode - - where the option is:: - - 1 -- Enable auto sleep mode - 0 -- Disable auto sleep mode - - Usage:: - - # Enable auto sleep mode - echo 1 > /debug/btmrvl/config/psmode - echo 1 > /debug/btmrvl/config/pscmd - - # Disable auto sleep mode - echo 0 > /debug/btmrvl/config/psmode - echo 1 > /debug/btmrvl/config/pscmd - - -hsmode=[n], hscmd - These commands are used to enable host sleep or wake up firmware - - where the option is:: - - 1 -- Enable host sleep - 0 -- Wake up firmware - - Usage:: - - # Enable host sleep - echo 1 > /debug/btmrvl/config/hsmode - echo 1 > /debug/btmrvl/config/hscmd - - # Wake up firmware - echo 0 > /debug/btmrvl/config/hsmode - echo 1 > /debug/btmrvl/config/hscmd - - -Get driver status -================= - -Path: /debug/btmrvl/status/ - -Usage:: - - cat /debug/btmrvl/status/ - -where the args are: - -curpsmode - This command displays current auto sleep status. - -psstate - This command display the power save state. - -hsstate - This command display the host sleep state. - -txdnldrdy - This command displays the value of Tx download ready flag. - -Issuing a raw hci command -========================= - -Use hcitool to issue raw hci command, refer to hcitool manual - -Usage:: - - Hcitool cmd [Parameters] - -Interface Control Command:: - - hcitool cmd 0x3f 0x5b 0xf5 0x01 0x00 --Enable All interface - hcitool cmd 0x3f 0x5b 0xf5 0x01 0x01 --Enable Wlan interface - hcitool cmd 0x3f 0x5b 0xf5 0x01 0x02 --Enable BT interface - hcitool cmd 0x3f 0x5b 0xf5 0x00 0x00 --Disable All interface - hcitool cmd 0x3f 0x5b 0xf5 0x00 0x01 --Disable Wlan interface - hcitool cmd 0x3f 0x5b 0xf5 0x00 0x02 --Disable BT interface - -SD8688 firmware -=============== - -Images: - -- /lib/firmware/sd8688_helper.bin -- /lib/firmware/sd8688.bin - - -The images can be downloaded from: - -git.infradead.org/users/dwmw2/linux-firmware.git/libertas/ diff --git a/Documentation/clearing-warn-once.txt b/Documentation/clearing-warn-once.txt deleted file mode 100644 index 211fd926cf00..000000000000 --- a/Documentation/clearing-warn-once.txt +++ /dev/null @@ -1,9 +0,0 @@ -Clearing WARN_ONCE ------------------- - -WARN_ONCE / WARN_ON_ONCE / printk_once only emit a message once. - -echo 1 > /sys/kernel/debug/clear_warn_once - -clears the state and allows the warnings to print once again. -This can be useful after test suite runs to reproduce problems. diff --git a/Documentation/cma/debugfs.rst b/Documentation/cma/debugfs.rst deleted file mode 100644 index 518fe401b5ee..000000000000 --- a/Documentation/cma/debugfs.rst +++ /dev/null @@ -1,27 +0,0 @@ -:orphan: - -===================== -CMA Debugfs Interface -===================== - -The CMA debugfs interface is useful to retrieve basic information out of the -different CMA areas and to test allocation/release in each of the areas. - -Each CMA zone represents a directory under /cma/, indexed by the -kernel's CMA index. So the first CMA zone would be: - - /cma/cma-0 - -The structure of the files created under that directory is as follows: - - - [RO] base_pfn: The base PFN (Page Frame Number) of the zone. - - [RO] count: Amount of memory in the CMA area. - - [RO] order_per_bit: Order of pages represented by one bit. - - [RO] bitmap: The bitmap of page states in the zone. - - [WO] alloc: Allocate N pages from that CMA area. For example:: - - echo 5 > /cma/cma-2/alloc - -would try to allocate 5 pages from the cma-2 area. - - - [WO] free: Free N pages from that CMA area, similar to the above. diff --git a/Documentation/cpu-load.txt b/Documentation/cpu-load.txt deleted file mode 100644 index 2d01ce43d2a2..000000000000 --- a/Documentation/cpu-load.txt +++ /dev/null @@ -1,114 +0,0 @@ -======== -CPU load -======== - -Linux exports various bits of information via ``/proc/stat`` and -``/proc/uptime`` that userland tools, such as top(1), use to calculate -the average time system spent in a particular state, for example:: - - $ iostat - Linux 2.6.18.3-exp (linmac) 02/20/2007 - - avg-cpu: %user %nice %system %iowait %steal %idle - 10.01 0.00 2.92 5.44 0.00 81.63 - - ... - -Here the system thinks that over the default sampling period the -system spent 10.01% of the time doing work in user space, 2.92% in the -kernel, and was overall 81.63% of the time idle. - -In most cases the ``/proc/stat`` information reflects the reality quite -closely, however due to the nature of how/when the kernel collects -this data sometimes it can not be trusted at all. - -So how is this information collected? Whenever timer interrupt is -signalled the kernel looks what kind of task was running at this -moment and increments the counter that corresponds to this tasks -kind/state. The problem with this is that the system could have -switched between various states multiple times between two timer -interrupts yet the counter is incremented only for the last state. - - -Example -------- - -If we imagine the system with one task that periodically burns cycles -in the following manner:: - - time line between two timer interrupts - |--------------------------------------| - ^ ^ - |_ something begins working | - |_ something goes to sleep - (only to be awaken quite soon) - -In the above situation the system will be 0% loaded according to the -``/proc/stat`` (since the timer interrupt will always happen when the -system is executing the idle handler), but in reality the load is -closer to 99%. - -One can imagine many more situations where this behavior of the kernel -will lead to quite erratic information inside ``/proc/stat``:: - - - /* gcc -o hog smallhog.c */ - #include - #include - #include - #include - #define HIST 10 - - static volatile sig_atomic_t stop; - - static void sighandler (int signr) - { - (void) signr; - stop = 1; - } - static unsigned long hog (unsigned long niters) - { - stop = 0; - while (!stop && --niters); - return niters; - } - int main (void) - { - int i; - struct itimerval it = { .it_interval = { .tv_sec = 0, .tv_usec = 1 }, - .it_value = { .tv_sec = 0, .tv_usec = 1 } }; - sigset_t set; - unsigned long v[HIST]; - double tmp = 0.0; - unsigned long n; - signal (SIGALRM, &sighandler); - setitimer (ITIMER_REAL, &it, NULL); - - hog (ULONG_MAX); - for (i = 0; i < HIST; ++i) v[i] = ULONG_MAX - hog (ULONG_MAX); - for (i = 0; i < HIST; ++i) tmp += v[i]; - tmp /= HIST; - n = tmp - (tmp / 3.0); - - sigemptyset (&set); - sigaddset (&set, SIGALRM); - - for (;;) { - hog (n); - sigwait (&set, &i); - } - return 0; - } - - -References ----------- - -- http://lkml.org/lkml/2007/2/12/6 -- Documentation/filesystems/proc.txt (1.8) - - -Thanks ------- - -Con Kolivas, Pavel Machek diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt deleted file mode 100644 index b90dafcc8237..000000000000 --- a/Documentation/cputopology.txt +++ /dev/null @@ -1,177 +0,0 @@ -=========================================== -How CPU topology info is exported via sysfs -=========================================== - -Export CPU topology info via sysfs. Items (attributes) are similar -to /proc/cpuinfo output of some architectures. They reside in -/sys/devices/system/cpu/cpuX/topology/: - -physical_package_id: - - physical package id of cpuX. Typically corresponds to a physical - socket number, but the actual value is architecture and platform - dependent. - -die_id: - - the CPU die ID of cpuX. Typically it is the hardware platform's - identifier (rather than the kernel's). The actual value is - architecture and platform dependent. - -core_id: - - the CPU core ID of cpuX. Typically it is the hardware platform's - identifier (rather than the kernel's). The actual value is - architecture and platform dependent. - -book_id: - - the book ID of cpuX. Typically it is the hardware platform's - identifier (rather than the kernel's). The actual value is - architecture and platform dependent. - -drawer_id: - - the drawer ID of cpuX. Typically it is the hardware platform's - identifier (rather than the kernel's). The actual value is - architecture and platform dependent. - -core_cpus: - - internal kernel map of CPUs within the same core. - (deprecated name: "thread_siblings") - -core_cpus_list: - - human-readable list of CPUs within the same core. - (deprecated name: "thread_siblings_list"); - -package_cpus: - - internal kernel map of the CPUs sharing the same physical_package_id. - (deprecated name: "core_siblings") - -package_cpus_list: - - human-readable list of CPUs sharing the same physical_package_id. - (deprecated name: "core_siblings_list") - -die_cpus: - - internal kernel map of CPUs within the same die. - -die_cpus_list: - - human-readable list of CPUs within the same die. - -book_siblings: - - internal kernel map of cpuX's hardware threads within the same - book_id. - -book_siblings_list: - - human-readable list of cpuX's hardware threads within the same - book_id. - -drawer_siblings: - - internal kernel map of cpuX's hardware threads within the same - drawer_id. - -drawer_siblings_list: - - human-readable list of cpuX's hardware threads within the same - drawer_id. - -Architecture-neutral, drivers/base/topology.c, exports these attributes. -However, the book and drawer related sysfs files will only be created if -CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are selected, respectively. - -CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are currently only used on s390, -where they reflect the cpu and cache hierarchy. - -For an architecture to support this feature, it must define some of -these macros in include/asm-XXX/topology.h:: - - #define topology_physical_package_id(cpu) - #define topology_die_id(cpu) - #define topology_core_id(cpu) - #define topology_book_id(cpu) - #define topology_drawer_id(cpu) - #define topology_sibling_cpumask(cpu) - #define topology_core_cpumask(cpu) - #define topology_die_cpumask(cpu) - #define topology_book_cpumask(cpu) - #define topology_drawer_cpumask(cpu) - -The type of ``**_id macros`` is int. -The type of ``**_cpumask macros`` is ``(const) struct cpumask *``. The latter -correspond with appropriate ``**_siblings`` sysfs attributes (except for -topology_sibling_cpumask() which corresponds with thread_siblings). - -To be consistent on all architectures, include/linux/topology.h -provides default definitions for any of the above macros that are -not defined by include/asm-XXX/topology.h: - -1) topology_physical_package_id: -1 -2) topology_die_id: -1 -3) topology_core_id: 0 -4) topology_sibling_cpumask: just the given CPU -5) topology_core_cpumask: just the given CPU -6) topology_die_cpumask: just the given CPU - -For architectures that don't support books (CONFIG_SCHED_BOOK) there are no -default definitions for topology_book_id() and topology_book_cpumask(). -For architectures that don't support drawers (CONFIG_SCHED_DRAWER) there are -no default definitions for topology_drawer_id() and topology_drawer_cpumask(). - -Additionally, CPU topology information is provided under -/sys/devices/system/cpu and includes these files. The internal -source for the output is in brackets ("[]"). - - =========== ========================================================== - kernel_max: the maximum CPU index allowed by the kernel configuration. - [NR_CPUS-1] - - offline: CPUs that are not online because they have been - HOTPLUGGED off (see cpu-hotplug.txt) or exceed the limit - of CPUs allowed by the kernel configuration (kernel_max - above). [~cpu_online_mask + cpus >= NR_CPUS] - - online: CPUs that are online and being scheduled [cpu_online_mask] - - possible: CPUs that have been allocated resources and can be - brought online if they are present. [cpu_possible_mask] - - present: CPUs that have been identified as being present in the - system. [cpu_present_mask] - =========== ========================================================== - -The format for the above output is compatible with cpulist_parse() -[see ]. Some examples follow. - -In this example, there are 64 CPUs in the system but cpus 32-63 exceed -the kernel max which is limited to 0..31 by the NR_CPUS config option -being 32. Note also that CPUs 2 and 4-31 are not online but could be -brought online as they are both present and possible:: - - kernel_max: 31 - offline: 2,4-31,32-63 - online: 0-1,3 - possible: 0-31 - present: 0-31 - -In this example, the NR_CPUS config option is 128, but the kernel was -started with possible_cpus=144. There are 4 CPUs in the system and cpu2 -was manually taken offline (and is the only CPU that can be brought -online.):: - - kernel_max: 127 - offline: 2,4-127,128-143 - online: 0-1,3 - possible: 0-127 - present: 0-3 - -See cpu-hotplug.txt for the possible_cpus=NUM kernel start parameter -as well as more information on the various cpumasks. diff --git a/Documentation/efi-stub.txt b/Documentation/efi-stub.txt deleted file mode 100644 index 833edb0d0bc4..000000000000 --- a/Documentation/efi-stub.txt +++ /dev/null @@ -1,100 +0,0 @@ -================= -The EFI Boot Stub -================= - -On the x86 and ARM platforms, a kernel zImage/bzImage can masquerade -as a PE/COFF image, thereby convincing EFI firmware loaders to load -it as an EFI executable. The code that modifies the bzImage header, -along with the EFI-specific entry point that the firmware loader -jumps to are collectively known as the "EFI boot stub", and live in -arch/x86/boot/header.S and arch/x86/boot/compressed/eboot.c, -respectively. For ARM the EFI stub is implemented in -arch/arm/boot/compressed/efi-header.S and -arch/arm/boot/compressed/efi-stub.c. EFI stub code that is shared -between architectures is in drivers/firmware/efi/libstub. - -For arm64, there is no compressed kernel support, so the Image itself -masquerades as a PE/COFF image and the EFI stub is linked into the -kernel. The arm64 EFI stub lives in arch/arm64/kernel/efi-entry.S -and drivers/firmware/efi/libstub/arm64-stub.c. - -By using the EFI boot stub it's possible to boot a Linux kernel -without the use of a conventional EFI boot loader, such as grub or -elilo. Since the EFI boot stub performs the jobs of a boot loader, in -a certain sense it *IS* the boot loader. - -The EFI boot stub is enabled with the CONFIG_EFI_STUB kernel option. - - -How to install bzImage.efi --------------------------- - -The bzImage located in arch/x86/boot/bzImage must be copied to the EFI -System Partition (ESP) and renamed with the extension ".efi". Without -the extension the EFI firmware loader will refuse to execute it. It's -not possible to execute bzImage.efi from the usual Linux file systems -because EFI firmware doesn't have support for them. For ARM the -arch/arm/boot/zImage should be copied to the system partition, and it -may not need to be renamed. Similarly for arm64, arch/arm64/boot/Image -should be copied but not necessarily renamed. - - -Passing kernel parameters from the EFI shell --------------------------------------------- - -Arguments to the kernel can be passed after bzImage.efi, e.g.:: - - fs0:> bzImage.efi console=ttyS0 root=/dev/sda4 - - -The "initrd=" option --------------------- - -Like most boot loaders, the EFI stub allows the user to specify -multiple initrd files using the "initrd=" option. This is the only EFI -stub-specific command line parameter, everything else is passed to the -kernel when it boots. - -The path to the initrd file must be an absolute path from the -beginning of the ESP, relative path names do not work. Also, the path -is an EFI-style path and directory elements must be separated with -backslashes (\). For example, given the following directory layout:: - - fs0:> - Kernels\ - bzImage.efi - initrd-large.img - - Ramdisks\ - initrd-small.img - initrd-medium.img - -to boot with the initrd-large.img file if the current working -directory is fs0:\Kernels, the following command must be used:: - - fs0:\Kernels> bzImage.efi initrd=\Kernels\initrd-large.img - -Notice how bzImage.efi can be specified with a relative path. That's -because the image we're executing is interpreted by the EFI shell, -which understands relative paths, whereas the rest of the command line -is passed to bzImage.efi. - - -The "dtb=" option ------------------ - -For the ARM and arm64 architectures, a device tree must be provided to -the kernel. Normally firmware shall supply the device tree via the -EFI CONFIGURATION TABLE. However, the "dtb=" command line option can -be used to override the firmware supplied device tree, or to supply -one when firmware is unable to. - -Please note: Firmware adds runtime configuration information to the -device tree before booting the kernel. If dtb= is used to override -the device tree, then any runtime data provided by firmware will be -lost. The dtb= option should only be used either as a debug tool, or -as a last resort when a device tree is not provided in the EFI -CONFIGURATION TABLE. - -"dtb=" is processed in the same manner as the "initrd=" option that is -described above. diff --git a/Documentation/fb/vesafb.rst b/Documentation/fb/vesafb.rst index 2ed0dfb661cf..6821c87b7893 100644 --- a/Documentation/fb/vesafb.rst +++ b/Documentation/fb/vesafb.rst @@ -30,7 +30,7 @@ How to use it? ============== Switching modes is done using the vga=... boot parameter. Read -Documentation/svga.txt for details. +Documentation/admin-guide/svga.rst for details. You should compile in both vgacon (for text mode) and vesafb (for graphics mode). Which of them takes over the console depends on diff --git a/Documentation/highuid.txt b/Documentation/highuid.txt deleted file mode 100644 index 6ee70465c0ea..000000000000 --- a/Documentation/highuid.txt +++ /dev/null @@ -1,80 +0,0 @@ -=================================================== -Notes on the change from 16-bit UIDs to 32-bit UIDs -=================================================== - -:Author: Chris Wing -:Last updated: January 11, 2000 - -- kernel code MUST take into account __kernel_uid_t and __kernel_uid32_t - when communicating between user and kernel space in an ioctl or data - structure. - -- kernel code should use uid_t and gid_t in kernel-private structures and - code. - -What's left to be done for 32-bit UIDs on all Linux architectures: - -- Disk quotas have an interesting limitation that is not related to the - maximum UID/GID. They are limited by the maximum file size on the - underlying filesystem, because quota records are written at offsets - corresponding to the UID in question. - Further investigation is needed to see if the quota system can cope - properly with huge UIDs. If it can deal with 64-bit file offsets on all - architectures, this should not be a problem. - -- Decide whether or not to keep backwards compatibility with the system - accounting file, or if we should break it as the comments suggest - (currently, the old 16-bit UID and GID are still written to disk, and - part of the former pad space is used to store separate 32-bit UID and - GID) - -- Need to validate that OS emulation calls the 16-bit UID - compatibility syscalls, if the OS being emulated used 16-bit UIDs, or - uses the 32-bit UID system calls properly otherwise. - - This affects at least: - - - iBCS on Intel - - - sparc32 emulation on sparc64 - (need to support whatever new 32-bit UID system calls are added to - sparc32) - -- Validate that all filesystems behave properly. - - At present, 32-bit UIDs _should_ work for: - - - ext2 - - ufs - - isofs - - nfs - - coda - - udf - - Ioctl() fixups have been made for: - - - ncpfs - - smbfs - - Filesystems with simple fixups to prevent 16-bit UID wraparound: - - - minix - - sysv - - qnx4 - - Other filesystems have not been checked yet. - -- The ncpfs and smpfs filesystems cannot presently use 32-bit UIDs in - all ioctl()s. Some new ioctl()s have been added with 32-bit UIDs, but - more are needed. (as well as new user<->kernel data structures) - -- The ELF core dump format only supports 16-bit UIDs on arm, i386, m68k, - sh, and sparc32. Fixing this is probably not that important, but would - require adding a new ELF section. - -- The ioctl()s used to control the in-kernel NFS server only support - 16-bit UIDs on arm, i386, m68k, sh, and sparc32. - -- make sure that the UID mapping feature of AX25 networking works properly - (it should be safe because it's always used a 32-bit integer to - communicate between user and kernel) diff --git a/Documentation/hw_random.txt b/Documentation/hw_random.txt deleted file mode 100644 index 121de96e395e..000000000000 --- a/Documentation/hw_random.txt +++ /dev/null @@ -1,105 +0,0 @@ -========================================================== -Linux support for random number generator in i8xx chipsets -========================================================== - -Introduction -============ - -The hw_random framework is software that makes use of a -special hardware feature on your CPU or motherboard, -a Random Number Generator (RNG). The software has two parts: -a core providing the /dev/hwrng character device and its -sysfs support, plus a hardware-specific driver that plugs -into that core. - -To make the most effective use of these mechanisms, you -should download the support software as well. Download the -latest version of the "rng-tools" package from the -hw_random driver's official Web site: - - http://sourceforge.net/projects/gkernel/ - -Those tools use /dev/hwrng to fill the kernel entropy pool, -which is used internally and exported by the /dev/urandom and -/dev/random special files. - -Theory of operation -=================== - -CHARACTER DEVICE. Using the standard open() -and read() system calls, you can read random data from -the hardware RNG device. This data is NOT CHECKED by any -fitness tests, and could potentially be bogus (if the -hardware is faulty or has been tampered with). Data is only -output if the hardware "has-data" flag is set, but nevertheless -a security-conscious person would run fitness tests on the -data before assuming it is truly random. - -The rng-tools package uses such tests in "rngd", and lets you -run them by hand with a "rngtest" utility. - -/dev/hwrng is char device major 10, minor 183. - -CLASS DEVICE. There is a /sys/class/misc/hw_random node with -two unique attributes, "rng_available" and "rng_current". The -"rng_available" attribute lists the hardware-specific drivers -available, while "rng_current" lists the one which is currently -connected to /dev/hwrng. If your system has more than one -RNG available, you may change the one used by writing a name from -the list in "rng_available" into "rng_current". - -========================================================================== - - -Hardware driver for Intel/AMD/VIA Random Number Generators (RNG) - - Copyright 2000,2001 Jeff Garzik - - Copyright 2000,2001 Philipp Rumpf - - -About the Intel RNG hardware, from the firmware hub datasheet -============================================================= - -The Firmware Hub integrates a Random Number Generator (RNG) -using thermal noise generated from inherently random quantum -mechanical properties of silicon. When not generating new random -bits the RNG circuitry will enter a low power state. Intel will -provide a binary software driver to give third party software -access to our RNG for use as a security feature. At this time, -the RNG is only to be used with a system in an OS-present state. - -Intel RNG Driver notes -====================== - -FIXME: support poll(2) - -.. note:: - - request_mem_region was removed, for three reasons: - - 1) Only one RNG is supported by this driver; - 2) The location used by the RNG is a fixed location in - MMIO-addressable memory; - 3) users with properly working BIOS e820 handling will always - have the region in which the RNG is located reserved, so - request_mem_region calls always fail for proper setups. - However, for people who use mem=XX, BIOS e820 information is - **not** in /proc/iomem, and request_mem_region(RNG_ADDR) can - succeed. - -Driver details -============== - -Based on: - Intel 82802AB/82802AC Firmware Hub (FWH) Datasheet - May 1999 Order Number: 290658-002 R - -Intel 82802 Firmware Hub: - Random Number Generator - Programmer's Reference Manual - December 1999 Order Number: 298029-001 R - -Intel 82802 Firmware HUB Random Number Generator Driver - Copyright (c) 2000 Matt Sottek - -Special thanks to Matt Sottek. I did the "guts", he -did the "brains" and all the testing. diff --git a/Documentation/iostats.txt b/Documentation/iostats.txt deleted file mode 100644 index 5d63b18bd6d1..000000000000 --- a/Documentation/iostats.txt +++ /dev/null @@ -1,197 +0,0 @@ -===================== -I/O statistics fields -===================== - -Since 2.4.20 (and some versions before, with patches), and 2.5.45, -more extensive disk statistics have been introduced to help measure disk -activity. Tools such as ``sar`` and ``iostat`` typically interpret these and do -the work for you, but in case you are interested in creating your own -tools, the fields are explained here. - -In 2.4 now, the information is found as additional fields in -``/proc/partitions``. In 2.6 and upper, the same information is found in two -places: one is in the file ``/proc/diskstats``, and the other is within -the sysfs file system, which must be mounted in order to obtain -the information. Throughout this document we'll assume that sysfs -is mounted on ``/sys``, although of course it may be mounted anywhere. -Both ``/proc/diskstats`` and sysfs use the same source for the information -and so should not differ. - -Here are examples of these different formats:: - - 2.4: - 3 0 39082680 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 - 3 1 9221278 hda1 35486 0 35496 38030 0 0 0 0 0 38030 38030 - - 2.6+ sysfs: - 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 - 35486 38030 38030 38030 - - 2.6+ diskstats: - 3 0 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 - 3 1 hda1 35486 38030 38030 38030 - - 4.18+ diskstats: - 3 0 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 0 0 0 0 - -On 2.4 you might execute ``grep 'hda ' /proc/partitions``. On 2.6+, you have -a choice of ``cat /sys/block/hda/stat`` or ``grep 'hda ' /proc/diskstats``. - -The advantage of one over the other is that the sysfs choice works well -if you are watching a known, small set of disks. ``/proc/diskstats`` may -be a better choice if you are watching a large number of disks because -you'll avoid the overhead of 50, 100, or 500 or more opens/closes with -each snapshot of your disk statistics. - -In 2.4, the statistics fields are those after the device name. In -the above example, the first field of statistics would be 446216. -By contrast, in 2.6+ if you look at ``/sys/block/hda/stat``, you'll -find just the eleven fields, beginning with 446216. If you look at -``/proc/diskstats``, the eleven fields will be preceded by the major and -minor device numbers, and device name. Each of these formats provides -eleven fields of statistics, each meaning exactly the same things. -All fields except field 9 are cumulative since boot. Field 9 should -go to zero as I/Os complete; all others only increase (unless they -overflow and wrap). Yes, these are (32-bit or 64-bit) unsigned long -(native word size) numbers, and on a very busy or long-lived system they -may wrap. Applications should be prepared to deal with that; unless -your observations are measured in large numbers of minutes or hours, -they should not wrap twice before you notice them. - -Each set of stats only applies to the indicated device; if you want -system-wide stats you'll have to find all the devices and sum them all up. - -Field 1 -- # of reads completed - This is the total number of reads completed successfully. - -Field 2 -- # of reads merged, field 6 -- # of writes merged - Reads and writes which are adjacent to each other may be merged for - efficiency. Thus two 4K reads may become one 8K read before it is - ultimately handed to the disk, and so it will be counted (and queued) - as only one I/O. This field lets you know how often this was done. - -Field 3 -- # of sectors read - This is the total number of sectors read successfully. - -Field 4 -- # of milliseconds spent reading - This is the total number of milliseconds spent by all reads (as - measured from __make_request() to end_that_request_last()). - -Field 5 -- # of writes completed - This is the total number of writes completed successfully. - -Field 6 -- # of writes merged - See the description of field 2. - -Field 7 -- # of sectors written - This is the total number of sectors written successfully. - -Field 8 -- # of milliseconds spent writing - This is the total number of milliseconds spent by all writes (as - measured from __make_request() to end_that_request_last()). - -Field 9 -- # of I/Os currently in progress - The only field that should go to zero. Incremented as requests are - given to appropriate struct request_queue and decremented as they finish. - -Field 10 -- # of milliseconds spent doing I/Os - This field increases so long as field 9 is nonzero. - - Since 5.0 this field counts jiffies when at least one request was - started or completed. If request runs more than 2 jiffies then some - I/O time will not be accounted unless there are other requests. - -Field 11 -- weighted # of milliseconds spent doing I/Os - This field is incremented at each I/O start, I/O completion, I/O - merge, or read of these stats by the number of I/Os in progress - (field 9) times the number of milliseconds spent doing I/O since the - last update of this field. This can provide an easy measure of both - I/O completion time and the backlog that may be accumulating. - -Field 12 -- # of discards completed - This is the total number of discards completed successfully. - -Field 13 -- # of discards merged - See the description of field 2 - -Field 14 -- # of sectors discarded - This is the total number of sectors discarded successfully. - -Field 15 -- # of milliseconds spent discarding - This is the total number of milliseconds spent by all discards (as - measured from __make_request() to end_that_request_last()). - -To avoid introducing performance bottlenecks, no locks are held while -modifying these counters. This implies that minor inaccuracies may be -introduced when changes collide, so (for instance) adding up all the -read I/Os issued per partition should equal those made to the disks ... -but due to the lack of locking it may only be very close. - -In 2.6+, there are counters for each CPU, which make the lack of locking -almost a non-issue. When the statistics are read, the per-CPU counters -are summed (possibly overflowing the unsigned long variable they are -summed to) and the result given to the user. There is no convenient -user interface for accessing the per-CPU counters themselves. - -Disks vs Partitions -------------------- - -There were significant changes between 2.4 and 2.6+ in the I/O subsystem. -As a result, some statistic information disappeared. The translation from -a disk address relative to a partition to the disk address relative to -the host disk happens much earlier. All merges and timings now happen -at the disk level rather than at both the disk and partition level as -in 2.4. Consequently, you'll see a different statistics output on 2.6+ for -partitions from that for disks. There are only *four* fields available -for partitions on 2.6+ machines. This is reflected in the examples above. - -Field 1 -- # of reads issued - This is the total number of reads issued to this partition. - -Field 2 -- # of sectors read - This is the total number of sectors requested to be read from this - partition. - -Field 3 -- # of writes issued - This is the total number of writes issued to this partition. - -Field 4 -- # of sectors written - This is the total number of sectors requested to be written to - this partition. - -Note that since the address is translated to a disk-relative one, and no -record of the partition-relative address is kept, the subsequent success -or failure of the read cannot be attributed to the partition. In other -words, the number of reads for partitions is counted slightly before time -of queuing for partitions, and at completion for whole disks. This is -a subtle distinction that is probably uninteresting for most cases. - -More significant is the error induced by counting the numbers of -reads/writes before merges for partitions and after for disks. Since a -typical workload usually contains a lot of successive and adjacent requests, -the number of reads/writes issued can be several times higher than the -number of reads/writes completed. - -In 2.6.25, the full statistic set is again available for partitions and -disk and partition statistics are consistent again. Since we still don't -keep record of the partition-relative address, an operation is attributed to -the partition which contains the first sector of the request after the -eventual merges. As requests can be merged across partition, this could lead -to some (probably insignificant) inaccuracy. - -Additional notes ----------------- - -In 2.6+, sysfs is not mounted by default. If your distribution of -Linux hasn't added it already, here's the line you'll want to add to -your ``/etc/fstab``:: - - none /sys sysfs defaults 0 0 - - -In 2.6+, all disk statistics were removed from ``/proc/stat``. In 2.4, they -appear in both ``/proc/partitions`` and ``/proc/stat``, although the ones in -``/proc/stat`` take a very different format from those in ``/proc/partitions`` -(see proc(5), if your system has it.) - --- ricklind@us.ibm.com diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt deleted file mode 100644 index 4f18456dd3b1..000000000000 --- a/Documentation/kernel-per-CPU-kthreads.txt +++ /dev/null @@ -1,356 +0,0 @@ -========================================== -Reducing OS jitter due to per-cpu kthreads -========================================== - -This document lists per-CPU kthreads in the Linux kernel and presents -options to control their OS jitter. Note that non-per-CPU kthreads are -not listed here. To reduce OS jitter from non-per-CPU kthreads, bind -them to a "housekeeping" CPU dedicated to such work. - -References -========== - -- Documentation/IRQ-affinity.txt: Binding interrupts to sets of CPUs. - -- Documentation/admin-guide/cgroup-v1: Using cgroups to bind tasks to sets of CPUs. - -- man taskset: Using the taskset command to bind tasks to sets - of CPUs. - -- man sched_setaffinity: Using the sched_setaffinity() system - call to bind tasks to sets of CPUs. - -- /sys/devices/system/cpu/cpuN/online: Control CPU N's hotplug state, - writing "0" to offline and "1" to online. - -- In order to locate kernel-generated OS jitter on CPU N: - - cd /sys/kernel/debug/tracing - echo 1 > max_graph_depth # Increase the "1" for more detail - echo function_graph > current_tracer - # run workload - cat per_cpu/cpuN/trace - -kthreads -======== - -Name: - ehca_comp/%u - -Purpose: - Periodically process Infiniband-related work. - -To reduce its OS jitter, do any of the following: - -1. Don't use eHCA Infiniband hardware, instead choosing hardware - that does not require per-CPU kthreads. This will prevent these - kthreads from being created in the first place. (This will - work for most people, as this hardware, though important, is - relatively old and is produced in relatively low unit volumes.) -2. Do all eHCA-Infiniband-related work on other CPUs, including - interrupts. -3. Rework the eHCA driver so that its per-CPU kthreads are - provisioned only on selected CPUs. - - -Name: - irq/%d-%s - -Purpose: - Handle threaded interrupts. - -To reduce its OS jitter, do the following: - -1. Use irq affinity to force the irq threads to execute on - some other CPU. - -Name: - kcmtpd_ctr_%d - -Purpose: - Handle Bluetooth work. - -To reduce its OS jitter, do one of the following: - -1. Don't use Bluetooth, in which case these kthreads won't be - created in the first place. -2. Use irq affinity to force Bluetooth-related interrupts to - occur on some other CPU and furthermore initiate all - Bluetooth activity on some other CPU. - -Name: - ksoftirqd/%u - -Purpose: - Execute softirq handlers when threaded or when under heavy load. - -To reduce its OS jitter, each softirq vector must be handled -separately as follows: - -TIMER_SOFTIRQ -------------- - -Do all of the following: - -1. To the extent possible, keep the CPU out of the kernel when it - is non-idle, for example, by avoiding system calls and by forcing - both kernel threads and interrupts to execute elsewhere. -2. Build with CONFIG_HOTPLUG_CPU=y. After boot completes, force - the CPU offline, then bring it back online. This forces - recurring timers to migrate elsewhere. If you are concerned - with multiple CPUs, force them all offline before bringing the - first one back online. Once you have onlined the CPUs in question, - do not offline any other CPUs, because doing so could force the - timer back onto one of the CPUs in question. - -NET_TX_SOFTIRQ and NET_RX_SOFTIRQ ---------------------------------- - -Do all of the following: - -1. Force networking interrupts onto other CPUs. -2. Initiate any network I/O on other CPUs. -3. Once your application has started, prevent CPU-hotplug operations - from being initiated from tasks that might run on the CPU to - be de-jittered. (It is OK to force this CPU offline and then - bring it back online before you start your application.) - -BLOCK_SOFTIRQ -------------- - -Do all of the following: - -1. Force block-device interrupts onto some other CPU. -2. Initiate any block I/O on other CPUs. -3. Once your application has started, prevent CPU-hotplug operations - from being initiated from tasks that might run on the CPU to - be de-jittered. (It is OK to force this CPU offline and then - bring it back online before you start your application.) - -IRQ_POLL_SOFTIRQ ----------------- - -Do all of the following: - -1. Force block-device interrupts onto some other CPU. -2. Initiate any block I/O and block-I/O polling on other CPUs. -3. Once your application has started, prevent CPU-hotplug operations - from being initiated from tasks that might run on the CPU to - be de-jittered. (It is OK to force this CPU offline and then - bring it back online before you start your application.) - -TASKLET_SOFTIRQ ---------------- - -Do one or more of the following: - -1. Avoid use of drivers that use tasklets. (Such drivers will contain - calls to things like tasklet_schedule().) -2. Convert all drivers that you must use from tasklets to workqueues. -3. Force interrupts for drivers using tasklets onto other CPUs, - and also do I/O involving these drivers on other CPUs. - -SCHED_SOFTIRQ -------------- - -Do all of the following: - -1. Avoid sending scheduler IPIs to the CPU to be de-jittered, - for example, ensure that at most one runnable kthread is present - on that CPU. If a thread that expects to run on the de-jittered - CPU awakens, the scheduler will send an IPI that can result in - a subsequent SCHED_SOFTIRQ. -2. CONFIG_NO_HZ_FULL=y and ensure that the CPU to be de-jittered - is marked as an adaptive-ticks CPU using the "nohz_full=" - boot parameter. This reduces the number of scheduler-clock - interrupts that the de-jittered CPU receives, minimizing its - chances of being selected to do the load balancing work that - runs in SCHED_SOFTIRQ context. -3. To the extent possible, keep the CPU out of the kernel when it - is non-idle, for example, by avoiding system calls and by - forcing both kernel threads and interrupts to execute elsewhere. - This further reduces the number of scheduler-clock interrupts - received by the de-jittered CPU. - -HRTIMER_SOFTIRQ ---------------- - -Do all of the following: - -1. To the extent possible, keep the CPU out of the kernel when it - is non-idle. For example, avoid system calls and force both - kernel threads and interrupts to execute elsewhere. -2. Build with CONFIG_HOTPLUG_CPU=y. Once boot completes, force the - CPU offline, then bring it back online. This forces recurring - timers to migrate elsewhere. If you are concerned with multiple - CPUs, force them all offline before bringing the first one - back online. Once you have onlined the CPUs in question, do not - offline any other CPUs, because doing so could force the timer - back onto one of the CPUs in question. - -RCU_SOFTIRQ ------------ - -Do at least one of the following: - -1. Offload callbacks and keep the CPU in either dyntick-idle or - adaptive-ticks state by doing all of the following: - - a. CONFIG_NO_HZ_FULL=y and ensure that the CPU to be - de-jittered is marked as an adaptive-ticks CPU using the - "nohz_full=" boot parameter. Bind the rcuo kthreads to - housekeeping CPUs, which can tolerate OS jitter. - b. To the extent possible, keep the CPU out of the kernel - when it is non-idle, for example, by avoiding system - calls and by forcing both kernel threads and interrupts - to execute elsewhere. - -2. Enable RCU to do its processing remotely via dyntick-idle by - doing all of the following: - - a. Build with CONFIG_NO_HZ=y and CONFIG_RCU_FAST_NO_HZ=y. - b. Ensure that the CPU goes idle frequently, allowing other - CPUs to detect that it has passed through an RCU quiescent - state. If the kernel is built with CONFIG_NO_HZ_FULL=y, - userspace execution also allows other CPUs to detect that - the CPU in question has passed through a quiescent state. - c. To the extent possible, keep the CPU out of the kernel - when it is non-idle, for example, by avoiding system - calls and by forcing both kernel threads and interrupts - to execute elsewhere. - -Name: - kworker/%u:%d%s (cpu, id, priority) - -Purpose: - Execute workqueue requests - -To reduce its OS jitter, do any of the following: - -1. Run your workload at a real-time priority, which will allow - preempting the kworker daemons. -2. A given workqueue can be made visible in the sysfs filesystem - by passing the WQ_SYSFS to that workqueue's alloc_workqueue(). - Such a workqueue can be confined to a given subset of the - CPUs using the ``/sys/devices/virtual/workqueue/*/cpumask`` sysfs - files. The set of WQ_SYSFS workqueues can be displayed using - "ls sys/devices/virtual/workqueue". That said, the workqueues - maintainer would like to caution people against indiscriminately - sprinkling WQ_SYSFS across all the workqueues. The reason for - caution is that it is easy to add WQ_SYSFS, but because sysfs is - part of the formal user/kernel API, it can be nearly impossible - to remove it, even if its addition was a mistake. -3. Do any of the following needed to avoid jitter that your - application cannot tolerate: - - a. Build your kernel with CONFIG_SLUB=y rather than - CONFIG_SLAB=y, thus avoiding the slab allocator's periodic - use of each CPU's workqueues to run its cache_reap() - function. - b. Avoid using oprofile, thus avoiding OS jitter from - wq_sync_buffer(). - c. Limit your CPU frequency so that a CPU-frequency - governor is not required, possibly enlisting the aid of - special heatsinks or other cooling technologies. If done - correctly, and if you CPU architecture permits, you should - be able to build your kernel with CONFIG_CPU_FREQ=n to - avoid the CPU-frequency governor periodically running - on each CPU, including cs_dbs_timer() and od_dbs_timer(). - - WARNING: Please check your CPU specifications to - make sure that this is safe on your particular system. - d. As of v3.18, Christoph Lameter's on-demand vmstat workers - commit prevents OS jitter due to vmstat_update() on - CONFIG_SMP=y systems. Before v3.18, is not possible - to entirely get rid of the OS jitter, but you can - decrease its frequency by writing a large value to - /proc/sys/vm/stat_interval. The default value is HZ, - for an interval of one second. Of course, larger values - will make your virtual-memory statistics update more - slowly. Of course, you can also run your workload at - a real-time priority, thus preempting vmstat_update(), - but if your workload is CPU-bound, this is a bad idea. - However, there is an RFC patch from Christoph Lameter - (based on an earlier one from Gilad Ben-Yossef) that - reduces or even eliminates vmstat overhead for some - workloads at https://lkml.org/lkml/2013/9/4/379. - e. Boot with "elevator=noop" to avoid workqueue use by - the block layer. - f. If running on high-end powerpc servers, build with - CONFIG_PPC_RTAS_DAEMON=n. This prevents the RTAS - daemon from running on each CPU every second or so. - (This will require editing Kconfig files and will defeat - this platform's RAS functionality.) This avoids jitter - due to the rtas_event_scan() function. - WARNING: Please check your CPU specifications to - make sure that this is safe on your particular system. - g. If running on Cell Processor, build your kernel with - CBE_CPUFREQ_SPU_GOVERNOR=n to avoid OS jitter from - spu_gov_work(). - WARNING: Please check your CPU specifications to - make sure that this is safe on your particular system. - h. If running on PowerMAC, build your kernel with - CONFIG_PMAC_RACKMETER=n to disable the CPU-meter, - avoiding OS jitter from rackmeter_do_timer(). - -Name: - rcuc/%u - -Purpose: - Execute RCU callbacks in CONFIG_RCU_BOOST=y kernels. - -To reduce its OS jitter, do at least one of the following: - -1. Build the kernel with CONFIG_PREEMPT=n. This prevents these - kthreads from being created in the first place, and also obviates - the need for RCU priority boosting. This approach is feasible - for workloads that do not require high degrees of responsiveness. -2. Build the kernel with CONFIG_RCU_BOOST=n. This prevents these - kthreads from being created in the first place. This approach - is feasible only if your workload never requires RCU priority - boosting, for example, if you ensure frequent idle time on all - CPUs that might execute within the kernel. -3. Build with CONFIG_RCU_NOCB_CPU=y and boot with the rcu_nocbs= - boot parameter offloading RCU callbacks from all CPUs susceptible - to OS jitter. This approach prevents the rcuc/%u kthreads from - having any work to do, so that they are never awakened. -4. Ensure that the CPU never enters the kernel, and, in particular, - avoid initiating any CPU hotplug operations on this CPU. This is - another way of preventing any callbacks from being queued on the - CPU, again preventing the rcuc/%u kthreads from having any work - to do. - -Name: - rcuop/%d and rcuos/%d - -Purpose: - Offload RCU callbacks from the corresponding CPU. - -To reduce its OS jitter, do at least one of the following: - -1. Use affinity, cgroups, or other mechanism to force these kthreads - to execute on some other CPU. -2. Build with CONFIG_RCU_NOCB_CPU=n, which will prevent these - kthreads from being created in the first place. However, please - note that this will not eliminate OS jitter, but will instead - shift it to RCU_SOFTIRQ. - -Name: - watchdog/%u - -Purpose: - Detect software lockups on each CPU. - -To reduce its OS jitter, do at least one of the following: - -1. Build with CONFIG_LOCKUP_DETECTOR=n, which will prevent these - kthreads from being created in the first place. -2. Boot with "nosoftlockup=0", which will also prevent these kthreads - from being created. Other related watchdog and softlockup boot - parameters may be found in Documentation/admin-guide/kernel-parameters.rst - and Documentation/watchdog/watchdog-parameters.rst. -3. Echo a zero to /proc/sys/kernel/watchdog to disable the - watchdog timer. -4. Echo a large number of /proc/sys/kernel/watchdog_thresh in - order to reduce the frequency of OS jitter due to the watchdog - timer down to a level that is acceptable for your workload. diff --git a/Documentation/ldm.txt b/Documentation/ldm.txt deleted file mode 100644 index 12c571368e73..000000000000 --- a/Documentation/ldm.txt +++ /dev/null @@ -1,121 +0,0 @@ -========================================== -LDM - Logical Disk Manager (Dynamic Disks) -========================================== - -:Author: Originally Written by FlatCap - Richard Russon . -:Last Updated: Anton Altaparmakov on 30 March 2007 for Windows Vista. - -Overview --------- - -Windows 2000, XP, and Vista use a new partitioning scheme. It is a complete -replacement for the MSDOS style partitions. It stores its information in a -1MiB journalled database at the end of the physical disk. The size of -partitions is limited only by disk space. The maximum number of partitions is -nearly 2000. - -Any partitions created under the LDM are called "Dynamic Disks". There are no -longer any primary or extended partitions. Normal MSDOS style partitions are -now known as Basic Disks. - -If you wish to use Spanned, Striped, Mirrored or RAID 5 Volumes, you must use -Dynamic Disks. The journalling allows Windows to make changes to these -partitions and filesystems without the need to reboot. - -Once the LDM driver has divided up the disk, you can use the MD driver to -assemble any multi-partition volumes, e.g. Stripes, RAID5. - -To prevent legacy applications from repartitioning the disk, the LDM creates a -dummy MSDOS partition containing one disk-sized partition. This is what is -supported with the Linux LDM driver. - -A newer approach that has been implemented with Vista is to put LDM on top of a -GPT label disk. This is not supported by the Linux LDM driver yet. - - -Example -------- - -Below we have a 50MiB disk, divided into seven partitions. - -.. note:: - - The missing 1MiB at the end of the disk is where the LDM database is - stored. - -+-------++--------------+---------+-----++--------------+---------+----+ -|Device || Offset Bytes | Sectors | MiB || Size Bytes | Sectors | MiB| -+=======++==============+=========+=====++==============+=========+====+ -|hda || 0 | 0 | 0 || 52428800 | 102400 | 50| -+-------++--------------+---------+-----++--------------+---------+----+ -|hda1 || 51380224 | 100352 | 49 || 1048576 | 2048 | 1| -+-------++--------------+---------+-----++--------------+---------+----+ -|hda2 || 16384 | 32 | 0 || 6979584 | 13632 | 6| -+-------++--------------+---------+-----++--------------+---------+----+ -|hda3 || 6995968 | 13664 | 6 || 10485760 | 20480 | 10| -+-------++--------------+---------+-----++--------------+---------+----+ -|hda4 || 17481728 | 34144 | 16 || 4194304 | 8192 | 4| -+-------++--------------+---------+-----++--------------+---------+----+ -|hda5 || 21676032 | 42336 | 20 || 5242880 | 10240 | 5| -+-------++--------------+---------+-----++--------------+---------+----+ -|hda6 || 26918912 | 52576 | 25 || 10485760 | 20480 | 10| -+-------++--------------+---------+-----++--------------+---------+----+ -|hda7 || 37404672 | 73056 | 35 || 13959168 | 27264 | 13| -+-------++--------------+---------+-----++--------------+---------+----+ - -The LDM Database may not store the partitions in the order that they appear on -disk, but the driver will sort them. - -When Linux boots, you will see something like:: - - hda: 102400 sectors w/32KiB Cache, CHS=50/64/32 - hda: [LDM] hda1 hda2 hda3 hda4 hda5 hda6 hda7 - - -Compiling LDM Support ---------------------- - -To enable LDM, choose the following two options: - - - "Advanced partition selection" CONFIG_PARTITION_ADVANCED - - "Windows Logical Disk Manager (Dynamic Disk) support" CONFIG_LDM_PARTITION - -If you believe the driver isn't working as it should, you can enable the extra -debugging code. This will produce a LOT of output. The option is: - - - "Windows LDM extra logging" CONFIG_LDM_DEBUG - -N.B. The partition code cannot be compiled as a module. - -As with all the partition code, if the driver doesn't see signs of its type of -partition, it will pass control to another driver, so there is no harm in -enabling it. - -If you have Dynamic Disks but don't enable the driver, then all you will see -is a dummy MSDOS partition filling the whole disk. You won't be able to mount -any of the volumes on the disk. - - -Booting -------- - -If you enable LDM support, then lilo is capable of booting from any of the -discovered partitions. However, grub does not understand the LDM partitioning -and cannot boot from a Dynamic Disk. - - -More Documentation ------------------- - -There is an Overview of the LDM together with complete Technical Documentation. -It is available for download. - - http://www.linux-ntfs.org/ - -If you have any LDM questions that aren't answered in the documentation, email -me. - -Cheers, - FlatCap - Richard Russon - ldm@flatcap.org - diff --git a/Documentation/lockup-watchdogs.txt b/Documentation/lockup-watchdogs.txt deleted file mode 100644 index 290840c160af..000000000000 --- a/Documentation/lockup-watchdogs.txt +++ /dev/null @@ -1,83 +0,0 @@ -=============================================================== -Softlockup detector and hardlockup detector (aka nmi_watchdog) -=============================================================== - -The Linux kernel can act as a watchdog to detect both soft and hard -lockups. - -A 'softlockup' is defined as a bug that causes the kernel to loop in -kernel mode for more than 20 seconds (see "Implementation" below for -details), without giving other tasks a chance to run. The current -stack trace is displayed upon detection and, by default, the system -will stay locked up. Alternatively, the kernel can be configured to -panic; a sysctl, "kernel.softlockup_panic", a kernel parameter, -"softlockup_panic" (see "Documentation/admin-guide/kernel-parameters.rst" for -details), and a compile option, "BOOTPARAM_SOFTLOCKUP_PANIC", are -provided for this. - -A 'hardlockup' is defined as a bug that causes the CPU to loop in -kernel mode for more than 10 seconds (see "Implementation" below for -details), without letting other interrupts have a chance to run. -Similarly to the softlockup case, the current stack trace is displayed -upon detection and the system will stay locked up unless the default -behavior is changed, which can be done through a sysctl, -'hardlockup_panic', a compile time knob, "BOOTPARAM_HARDLOCKUP_PANIC", -and a kernel parameter, "nmi_watchdog" -(see "Documentation/admin-guide/kernel-parameters.rst" for details). - -The panic option can be used in combination with panic_timeout (this -timeout is set through the confusingly named "kernel.panic" sysctl), -to cause the system to reboot automatically after a specified amount -of time. - -Implementation -============== - -The soft and hard lockup detectors are built on top of the hrtimer and -perf subsystems, respectively. A direct consequence of this is that, -in principle, they should work in any architecture where these -subsystems are present. - -A periodic hrtimer runs to generate interrupts and kick the watchdog -task. An NMI perf event is generated every "watchdog_thresh" -(compile-time initialized to 10 and configurable through sysctl of the -same name) seconds to check for hardlockups. If any CPU in the system -does not receive any hrtimer interrupt during that time the -'hardlockup detector' (the handler for the NMI perf event) will -generate a kernel warning or call panic, depending on the -configuration. - -The watchdog task is a high priority kernel thread that updates a -timestamp every time it is scheduled. If that timestamp is not updated -for 2*watchdog_thresh seconds (the softlockup threshold) the -'softlockup detector' (coded inside the hrtimer callback function) -will dump useful debug information to the system log, after which it -will call panic if it was instructed to do so or resume execution of -other kernel code. - -The period of the hrtimer is 2*watchdog_thresh/5, which means it has -two or three chances to generate an interrupt before the hardlockup -detector kicks in. - -As explained above, a kernel knob is provided that allows -administrators to configure the period of the hrtimer and the perf -event. The right value for a particular environment is a trade-off -between fast response to lockups and detection overhead. - -By default, the watchdog runs on all online cores. However, on a -kernel configured with NO_HZ_FULL, by default the watchdog runs only -on the housekeeping cores, not the cores specified in the "nohz_full" -boot argument. If we allowed the watchdog to run by default on -the "nohz_full" cores, we would have to run timer ticks to activate -the scheduler, which would prevent the "nohz_full" functionality -from protecting the user code on those cores from the kernel. -Of course, disabling it by default on the nohz_full cores means that -when those cores do enter the kernel, by default we will not be -able to detect if they lock up. However, allowing the watchdog -to continue to run on the housekeeping (non-tickless) cores means -that we will continue to detect lockups properly on those cores. - -In either case, the set of cores excluded from running the watchdog -may be adjusted via the kernel.watchdog_cpumask sysctl. For -nohz_full cores, this may be useful for debugging a case where the -kernel seems to be hanging on the nohz_full cores. diff --git a/Documentation/numastat.txt b/Documentation/numastat.txt deleted file mode 100644 index aaf1667489f8..000000000000 --- a/Documentation/numastat.txt +++ /dev/null @@ -1,30 +0,0 @@ -=============================== -Numa policy hit/miss statistics -=============================== - -/sys/devices/system/node/node*/numastat - -All units are pages. Hugepages have separate counters. - -=============== ============================================================ -numa_hit A process wanted to allocate memory from this node, - and succeeded. - -numa_miss A process wanted to allocate memory from another node, - but ended up with memory from this node. - -numa_foreign A process wanted to allocate on this node, - but ended up with memory from another one. - -local_node A process ran on this node and got memory from it. - -other_node A process ran on this node and got memory from another node. - -interleave_hit Interleaving wanted to allocate from this node - and succeeded. -=============== ============================================================ - -For easier reading you can use the numastat utility from the numactl package -(http://oss.sgi.com/projects/libnuma/). Note that it only works -well right now on machines with a small number of CPUs. - diff --git a/Documentation/pnp.txt b/Documentation/pnp.txt deleted file mode 100644 index bab2d10631f0..000000000000 --- a/Documentation/pnp.txt +++ /dev/null @@ -1,292 +0,0 @@ -================================= -Linux Plug and Play Documentation -================================= - -:Author: Adam Belay -:Last updated: Oct. 16, 2002 - - -Overview --------- - -Plug and Play provides a means of detecting and setting resources for legacy or -otherwise unconfigurable devices. The Linux Plug and Play Layer provides these -services to compatible drivers. - - -The User Interface ------------------- - -The Linux Plug and Play user interface provides a means to activate PnP devices -for legacy and user level drivers that do not support Linux Plug and Play. The -user interface is integrated into sysfs. - -In addition to the standard sysfs file the following are created in each -device's directory: -- id - displays a list of support EISA IDs -- options - displays possible resource configurations -- resources - displays currently allocated resources and allows resource changes - -activating a device -^^^^^^^^^^^^^^^^^^^ - -:: - - # echo "auto" > resources - -this will invoke the automatic resource config system to activate the device - -manually activating a device -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -:: - - # echo "manual " > resources - - - the configuration number - - static or dynamic - static = for next boot - dynamic = now - -disabling a device -^^^^^^^^^^^^^^^^^^ - -:: - - # echo "disable" > resources - - -EXAMPLE: - -Suppose you need to activate the floppy disk controller. - -1. change to the proper directory, in my case it is - /driver/bus/pnp/devices/00:0f:: - - # cd /driver/bus/pnp/devices/00:0f - # cat name - PC standard floppy disk controller - -2. check if the device is already active:: - - # cat resources - DISABLED - - - Notice the string "DISABLED". This means the device is not active. - -3. check the device's possible configurations (optional):: - - # cat options - Dependent: 01 - Priority acceptable - port 0x3f0-0x3f0, align 0x7, size 0x6, 16-bit address decoding - port 0x3f7-0x3f7, align 0x0, size 0x1, 16-bit address decoding - irq 6 - dma 2 8-bit compatible - Dependent: 02 - Priority acceptable - port 0x370-0x370, align 0x7, size 0x6, 16-bit address decoding - port 0x377-0x377, align 0x0, size 0x1, 16-bit address decoding - irq 6 - dma 2 8-bit compatible - -4. now activate the device:: - - # echo "auto" > resources - -5. finally check if the device is active:: - - # cat resources - io 0x3f0-0x3f5 - io 0x3f7-0x3f7 - irq 6 - dma 2 - -also there are a series of kernel parameters:: - - pnp_reserve_irq=irq1[,irq2] .... - pnp_reserve_dma=dma1[,dma2] .... - pnp_reserve_io=io1,size1[,io2,size2] .... - pnp_reserve_mem=mem1,size1[,mem2,size2] .... - - - -The Unified Plug and Play Layer -------------------------------- - -All Plug and Play drivers, protocols, and services meet at a central location -called the Plug and Play Layer. This layer is responsible for the exchange of -information between PnP drivers and PnP protocols. Thus it automatically -forwards commands to the proper protocol. This makes writing PnP drivers -significantly easier. - -The following functions are available from the Plug and Play Layer: - -pnp_get_protocol - increments the number of uses by one - -pnp_put_protocol - deincrements the number of uses by one - -pnp_register_protocol - use this to register a new PnP protocol - -pnp_unregister_protocol - use this function to remove a PnP protocol from the Plug and Play Layer - -pnp_register_driver - adds a PnP driver to the Plug and Play Layer - - this includes driver model integration - returns zero for success or a negative error number for failure; count - calls to the .add() method if you need to know how many devices bind to - the driver - -pnp_unregister_driver - removes a PnP driver from the Plug and Play Layer - - - -Plug and Play Protocols ------------------------ - -This section contains information for PnP protocol developers. - -The following Protocols are currently available in the computing world: - -- PNPBIOS: - used for system devices such as serial and parallel ports. -- ISAPNP: - provides PnP support for the ISA bus -- ACPI: - among its many uses, ACPI provides information about system level - devices. - -It is meant to replace the PNPBIOS. It is not currently supported by Linux -Plug and Play but it is planned to be in the near future. - - -Requirements for a Linux PnP protocol: -1. the protocol must use EISA IDs -2. the protocol must inform the PnP Layer of a device's current configuration - -- the ability to set resources is optional but preferred. - -The following are PnP protocol related functions: - -pnp_add_device - use this function to add a PnP device to the PnP layer - - only call this function when all wanted values are set in the pnp_dev - structure - -pnp_init_device - call this to initialize the PnP structure - -pnp_remove_device - call this to remove a device from the Plug and Play Layer. - it will fail if the device is still in use. - automatically will free mem used by the device and related structures - -pnp_add_id - adds an EISA ID to the list of supported IDs for the specified device - -For more information consult the source of a protocol such as -/drivers/pnp/pnpbios/core.c. - - - -Linux Plug and Play Drivers ---------------------------- - -This section contains information for Linux PnP driver developers. - -The New Way -^^^^^^^^^^^ - -1. first make a list of supported EISA IDS - - ex:: - - static const struct pnp_id pnp_dev_table[] = { - /* Standard LPT Printer Port */ - {.id = "PNP0400", .driver_data = 0}, - /* ECP Printer Port */ - {.id = "PNP0401", .driver_data = 0}, - {.id = ""} - }; - - Please note that the character 'X' can be used as a wild card in the function - portion (last four characters). - - ex:: - - /* Unknown PnP modems */ - { "PNPCXXX", UNKNOWN_DEV }, - - Supported PnP card IDs can optionally be defined. - ex:: - - static const struct pnp_id pnp_card_table[] = { - { "ANYDEVS", 0 }, - { "", 0 } - }; - -2. Optionally define probe and remove functions. It may make sense not to - define these functions if the driver already has a reliable method of detecting - the resources, such as the parport_pc driver. - - ex:: - - static int - serial_pnp_probe(struct pnp_dev * dev, const struct pnp_id *card_id, const - struct pnp_id *dev_id) - { - . . . - - ex:: - - static void serial_pnp_remove(struct pnp_dev * dev) - { - . . . - - consult /drivers/serial/8250_pnp.c for more information. - -3. create a driver structure - - ex:: - - static struct pnp_driver serial_pnp_driver = { - .name = "serial", - .card_id_table = pnp_card_table, - .id_table = pnp_dev_table, - .probe = serial_pnp_probe, - .remove = serial_pnp_remove, - }; - - * name and id_table cannot be NULL. - -4. register the driver - - ex:: - - static int __init serial8250_pnp_init(void) - { - return pnp_register_driver(&serial_pnp_driver); - } - -The Old Way -^^^^^^^^^^^ - -A series of compatibility functions have been created to make it easy to convert -ISAPNP drivers. They should serve as a temporary solution only. - -They are as follows:: - - struct pnp_card *pnp_find_card(unsigned short vendor, - unsigned short device, - struct pnp_card *from) - - struct pnp_dev *pnp_find_dev(struct pnp_card *card, - unsigned short vendor, - unsigned short function, - struct pnp_dev *from) - diff --git a/Documentation/rtc.txt b/Documentation/rtc.txt deleted file mode 100644 index 688c95b11919..000000000000 --- a/Documentation/rtc.txt +++ /dev/null @@ -1,140 +0,0 @@ -======================================= -Real Time Clock (RTC) Drivers for Linux -======================================= - -When Linux developers talk about a "Real Time Clock", they usually mean -something that tracks wall clock time and is battery backed so that it -works even with system power off. Such clocks will normally not track -the local time zone or daylight savings time -- unless they dual boot -with MS-Windows -- but will instead be set to Coordinated Universal Time -(UTC, formerly "Greenwich Mean Time"). - -The newest non-PC hardware tends to just count seconds, like the time(2) -system call reports, but RTCs also very commonly represent time using -the Gregorian calendar and 24 hour time, as reported by gmtime(3). - -Linux has two largely-compatible userspace RTC API families you may -need to know about: - - * /dev/rtc ... is the RTC provided by PC compatible systems, - so it's not very portable to non-x86 systems. - - * /dev/rtc0, /dev/rtc1 ... are part of a framework that's - supported by a wide variety of RTC chips on all systems. - -Programmers need to understand that the PC/AT functionality is not -always available, and some systems can do much more. That is, the -RTCs use the same API to make requests in both RTC frameworks (using -different filenames of course), but the hardware may not offer the -same functionality. For example, not every RTC is hooked up to an -IRQ, so they can't all issue alarms; and where standard PC RTCs can -only issue an alarm up to 24 hours in the future, other hardware may -be able to schedule one any time in the upcoming century. - - -Old PC/AT-Compatible driver: /dev/rtc --------------------------------------- - -All PCs (even Alpha machines) have a Real Time Clock built into them. -Usually they are built into the chipset of the computer, but some may -actually have a Motorola MC146818 (or clone) on the board. This is the -clock that keeps the date and time while your computer is turned off. - -ACPI has standardized that MC146818 functionality, and extended it in -a few ways (enabling longer alarm periods, and wake-from-hibernate). -That functionality is NOT exposed in the old driver. - -However it can also be used to generate signals from a slow 2Hz to a -relatively fast 8192Hz, in increments of powers of two. These signals -are reported by interrupt number 8. (Oh! So *that* is what IRQ 8 is -for...) It can also function as a 24hr alarm, raising IRQ 8 when the -alarm goes off. The alarm can also be programmed to only check any -subset of the three programmable values, meaning that it could be set to -ring on the 30th second of the 30th minute of every hour, for example. -The clock can also be set to generate an interrupt upon every clock -update, thus generating a 1Hz signal. - -The interrupts are reported via /dev/rtc (major 10, minor 135, read only -character device) in the form of an unsigned long. The low byte contains -the type of interrupt (update-done, alarm-rang, or periodic) that was -raised, and the remaining bytes contain the number of interrupts since -the last read. Status information is reported through the pseudo-file -/proc/driver/rtc if the /proc filesystem was enabled. The driver has -built in locking so that only one process is allowed to have the /dev/rtc -interface open at a time. - -A user process can monitor these interrupts by doing a read(2) or a -select(2) on /dev/rtc -- either will block/stop the user process until -the next interrupt is received. This is useful for things like -reasonably high frequency data acquisition where one doesn't want to -burn up 100% CPU by polling gettimeofday etc. etc. - -At high frequencies, or under high loads, the user process should check -the number of interrupts received since the last read to determine if -there has been any interrupt "pileup" so to speak. Just for reference, a -typical 486-33 running a tight read loop on /dev/rtc will start to suffer -occasional interrupt pileup (i.e. > 1 IRQ event since last read) for -frequencies above 1024Hz. So you really should check the high bytes -of the value you read, especially at frequencies above that of the -normal timer interrupt, which is 100Hz. - -Programming and/or enabling interrupt frequencies greater than 64Hz is -only allowed by root. This is perhaps a bit conservative, but we don't want -an evil user generating lots of IRQs on a slow 386sx-16, where it might have -a negative impact on performance. This 64Hz limit can be changed by writing -a different value to /proc/sys/dev/rtc/max-user-freq. Note that the -interrupt handler is only a few lines of code to minimize any possibility -of this effect. - -Also, if the kernel time is synchronized with an external source, the -kernel will write the time back to the CMOS clock every 11 minutes. In -the process of doing this, the kernel briefly turns off RTC periodic -interrupts, so be aware of this if you are doing serious work. If you -don't synchronize the kernel time with an external source (via ntp or -whatever) then the kernel will keep its hands off the RTC, allowing you -exclusive access to the device for your applications. - -The alarm and/or interrupt frequency are programmed into the RTC via -various ioctl(2) calls as listed in ./include/linux/rtc.h -Rather than write 50 pages describing the ioctl() and so on, it is -perhaps more useful to include a small test program that demonstrates -how to use them, and demonstrates the features of the driver. This is -probably a lot more useful to people interested in writing applications -that will be using this driver. See the code at the end of this document. - -(The original /dev/rtc driver was written by Paul Gortmaker.) - - -New portable "RTC Class" drivers: /dev/rtcN --------------------------------------------- - -Because Linux supports many non-ACPI and non-PC platforms, some of which -have more than one RTC style clock, it needed a more portable solution -than expecting a single battery-backed MC146818 clone on every system. -Accordingly, a new "RTC Class" framework has been defined. It offers -three different userspace interfaces: - - * /dev/rtcN ... much the same as the older /dev/rtc interface - - * /sys/class/rtc/rtcN ... sysfs attributes support readonly - access to some RTC attributes. - - * /proc/driver/rtc ... the system clock RTC may expose itself - using a procfs interface. If there is no RTC for the system clock, - rtc0 is used by default. More information is (currently) shown - here than through sysfs. - -The RTC Class framework supports a wide variety of RTCs, ranging from those -integrated into embeddable system-on-chip (SOC) processors to discrete chips -using I2C, SPI, or some other bus to communicate with the host CPU. There's -even support for PC-style RTCs ... including the features exposed on newer PCs -through ACPI. - -The new framework also removes the "one RTC per system" restriction. For -example, maybe the low-power battery-backed RTC is a discrete I2C chip, but -a high functionality RTC is integrated into the SOC. That system might read -the system clock from the discrete RTC, but use the integrated one for all -other tasks, because of its greater functionality. - -Check out tools/testing/selftests/rtc/rtctest.c for an example usage of the -ioctl interface. diff --git a/Documentation/svga.txt b/Documentation/svga.txt deleted file mode 100644 index b6c2f9acca92..000000000000 --- a/Documentation/svga.txt +++ /dev/null @@ -1,249 +0,0 @@ -.. include:: - -================================= -Video Mode Selection Support 2.13 -================================= - -:Copyright: |copy| 1995--1999 Martin Mares, - -Intro -~~~~~ - -This small document describes the "Video Mode Selection" feature which -allows the use of various special video modes supported by the video BIOS. Due -to usage of the BIOS, the selection is limited to boot time (before the -kernel decompression starts) and works only on 80X86 machines. - -.. note:: - - Short intro for the impatient: Just use vga=ask for the first time, - enter ``scan`` on the video mode prompt, pick the mode you want to use, - remember its mode ID (the four-digit hexadecimal number) and then - set the vga parameter to this number (converted to decimal first). - -The video mode to be used is selected by a kernel parameter which can be -specified in the kernel Makefile (the SVGA_MODE=... line) or by the "vga=..." -option of LILO (or some other boot loader you use) or by the "vidmode" utility -(present in standard Linux utility packages). You can use the following values -of this parameter:: - - NORMAL_VGA - Standard 80x25 mode available on all display adapters. - - EXTENDED_VGA - Standard 8-pixel font mode: 80x43 on EGA, 80x50 on VGA. - - ASK_VGA - Display a video mode menu upon startup (see below). - - 0..35 - Menu item number (when you have used the menu to view the list of - modes available on your adapter, you can specify the menu item you want - to use). 0..9 correspond to "0".."9", 10..35 to "a".."z". Warning: the - mode list displayed may vary as the kernel version changes, because the - modes are listed in a "first detected -- first displayed" manner. It's - better to use absolute mode numbers instead. - - 0x.... - Hexadecimal video mode ID (also displayed on the menu, see below - for exact meaning of the ID). Warning: rdev and LILO don't support - hexadecimal numbers -- you have to convert it to decimal manually. - -Menu -~~~~ - -The ASK_VGA mode causes the kernel to offer a video mode menu upon -bootup. It displays a "Press to see video modes available, -to continue or wait 30 secs" message. If you press , you enter the -menu, if you press or wait 30 seconds, the kernel will boot up in -the standard 80x25 mode. - -The menu looks like:: - - Video adapter: - Mode: COLSxROWS: - 0 0F00 80x25 - 1 0F01 80x50 - 2 0F02 80x43 - 3 0F03 80x26 - .... - Enter mode number or ``scan``: - - tells what video adapter did Linux detect --- it's either a generic adapter name (MDA, CGA, HGC, EGA, VGA, VESA VGA [a VGA -with VESA-compliant BIOS]) or a chipset name (e.g., Trident). Direct detection -of chipsets is turned off by default as it's inherently unreliable due to -absolutely insane PC design. - -"0 0F00 80x25" means that the first menu item (the menu items are numbered -from "0" to "9" and from "a" to "z") is a 80x25 mode with ID=0x0f00 (see the -next section for a description of mode IDs). - - encourages you to enter the item number or mode ID -you wish to set and press . If the computer complains something about -"Unknown mode ID", it is trying to tell you that it isn't possible to set such -a mode. It's also possible to press only which leaves the current mode. - -The mode list usually contains a few basic modes and some VESA modes. In -case your chipset has been detected, some chipset-specific modes are shown as -well (some of these might be missing or unusable on your machine as different -BIOSes are often shipped with the same card and the mode numbers depend purely -on the VGA BIOS). - -The modes displayed on the menu are partially sorted: The list starts with -the standard modes (80x25 and 80x50) followed by "special" modes (80x28 and -80x43), local modes (if the local modes feature is enabled), VESA modes and -finally SVGA modes for the auto-detected adapter. - -If you are not happy with the mode list offered (e.g., if you think your card -is able to do more), you can enter "scan" instead of item number / mode ID. The -program will try to ask the BIOS for all possible video mode numbers and test -what happens then. The screen will be probably flashing wildly for some time and -strange noises will be heard from inside the monitor and so on and then, really -all consistent video modes supported by your BIOS will appear (plus maybe some -``ghost modes``). If you are afraid this could damage your monitor, don't use -this function. - -After scanning, the mode ordering is a bit different: the auto-detected SVGA -modes are not listed at all and the modes revealed by ``scan`` are shown before -all VESA modes. - -Mode IDs -~~~~~~~~ - -Because of the complexity of all the video stuff, the video mode IDs -used here are also a bit complex. A video mode ID is a 16-bit number usually -expressed in a hexadecimal notation (starting with "0x"). You can set a mode -by entering its mode directly if you know it even if it isn't shown on the menu. - -The ID numbers can be divided to those regions:: - - 0x0000 to 0x00ff - menu item references. 0x0000 is the first item. Don't use - outside the menu as this can change from boot to boot (especially if you - have used the ``scan`` feature). - - 0x0100 to 0x017f - standard BIOS modes. The ID is a BIOS video mode number - (as presented to INT 10, function 00) increased by 0x0100. - - 0x0200 to 0x08ff - VESA BIOS modes. The ID is a VESA mode ID increased by - 0x0100. All VESA modes should be autodetected and shown on the menu. - - 0x0900 to 0x09ff - Video7 special modes. Set by calling INT 0x10, AX=0x6f05. - (Usually 940=80x43, 941=132x25, 942=132x44, 943=80x60, 944=100x60, - 945=132x28 for the standard Video7 BIOS) - - 0x0f00 to 0x0fff - special modes (they are set by various tricks -- usually - by modifying one of the standard modes). Currently available: - 0x0f00 standard 80x25, don't reset mode if already set (=FFFF) - 0x0f01 standard with 8-point font: 80x43 on EGA, 80x50 on VGA - 0x0f02 VGA 80x43 (VGA switched to 350 scanlines with a 8-point font) - 0x0f03 VGA 80x28 (standard VGA scans, but 14-point font) - 0x0f04 leave current video mode - 0x0f05 VGA 80x30 (480 scans, 16-point font) - 0x0f06 VGA 80x34 (480 scans, 14-point font) - 0x0f07 VGA 80x60 (480 scans, 8-point font) - 0x0f08 Graphics hack (see the VIDEO_GFX_HACK paragraph below) - - 0x1000 to 0x7fff - modes specified by resolution. The code has a "0xRRCC" - form where RR is a number of rows and CC is a number of columns. - E.g., 0x1950 corresponds to a 80x25 mode, 0x2b84 to 132x43 etc. - This is the only fully portable way to refer to a non-standard mode, - but it relies on the mode being found and displayed on the menu - (remember that mode scanning is not done automatically). - - 0xff00 to 0xffff - aliases for backward compatibility: - 0xffff equivalent to 0x0f00 (standard 80x25) - 0xfffe equivalent to 0x0f01 (EGA 80x43 or VGA 80x50) - -If you add 0x8000 to the mode ID, the program will try to recalculate -vertical display timing according to mode parameters, which can be used to -eliminate some annoying bugs of certain VGA BIOSes (usually those used for -cards with S3 chipsets and old Cirrus Logic BIOSes) -- mainly extra lines at the -end of the display. - -Options -~~~~~~~ - -Build options for arch/x86/boot/* are selected by the kernel kconfig -utility and the kernel .config file. - -VIDEO_GFX_HACK - includes special hack for setting of graphics modes -to be used later by special drivers. -Allows to set _any_ BIOS mode including graphic ones and forcing specific -text screen resolution instead of peeking it from BIOS variables. Don't use -unless you think you know what you're doing. To activate this setup, use -mode number 0x0f08 (see the Mode IDs section above). - -Still doesn't work? -~~~~~~~~~~~~~~~~~~~ - -When the mode detection doesn't work (e.g., the mode list is incorrect or -the machine hangs instead of displaying the menu), try to switch off some of -the configuration options listed under "Options". If it fails, you can still use -your kernel with the video mode set directly via the kernel parameter. - -In either case, please send me a bug report containing what _exactly_ -happens and how do the configuration switches affect the behaviour of the bug. - -If you start Linux from M$-DOS, you might also use some DOS tools for -video mode setting. In this case, you must specify the 0x0f04 mode ("leave -current settings") to Linux, because if you don't and you use any non-standard -mode, Linux will switch to 80x25 automatically. - -If you set some extended mode and there's one or more extra lines on the -bottom of the display containing already scrolled-out text, your VGA BIOS -contains the most common video BIOS bug called "incorrect vertical display -end setting". Adding 0x8000 to the mode ID might fix the problem. Unfortunately, -this must be done manually -- no autodetection mechanisms are available. - -History -~~~~~~~ - -=============== ================================================================ -1.0 (??-Nov-95) First version supporting all adapters supported by the old - setup.S + Cirrus Logic 54XX. Present in some 1.3.4? kernels - and then removed due to instability on some machines. -2.0 (28-Jan-96) Rewritten from scratch. Cirrus Logic 64XX support added, almost - everything is configurable, the VESA support should be much more - stable, explicit mode numbering allowed, "scan" implemented etc. -2.1 (30-Jan-96) VESA modes moved to 0x200-0x3ff. Mode selection by resolution - supported. Few bugs fixed. VESA modes are listed prior to - modes supplied by SVGA autodetection as they are more reliable. - CLGD autodetect works better. Doesn't depend on 80x25 being - active when started. Scanning fixed. 80x43 (any VGA) added. - Code cleaned up. -2.2 (01-Feb-96) EGA 80x43 fixed. VESA extended to 0x200-0x4ff (non-standard 02XX - VESA modes work now). Display end bug workaround supported. - Special modes renumbered to allow adding of the "recalculate" - flag, 0xffff and 0xfffe became aliases instead of real IDs. - Screen contents retained during mode changes. -2.3 (15-Mar-96) Changed to work with 1.3.74 kernel. -2.4 (18-Mar-96) Added patches by Hans Lermen fixing a memory overwrite problem - with some boot loaders. Memory management rewritten to reflect - these changes. Unfortunately, screen contents retaining works - only with some loaders now. - Added a Tseng 132x60 mode. -2.5 (19-Mar-96) Fixed a VESA mode scanning bug introduced in 2.4. -2.6 (25-Mar-96) Some VESA BIOS errors not reported -- it fixes error reports on - several cards with broken VESA code (e.g., ATI VGA). -2.7 (09-Apr-96) - Accepted all VESA modes in range 0x100 to 0x7ff, because some - cards use very strange mode numbers. - - Added Realtek VGA modes (thanks to Gonzalo Tornaria). - - Hardware testing order slightly changed, tests based on ROM - contents done as first. - - Added support for special Video7 mode switching functions - (thanks to Tom Vander Aa). - - Added 480-scanline modes (especially useful for notebooks, - original version written by hhanemaa@cs.ruu.nl, patched by - Jeff Chua, rewritten by me). - - Screen store/restore fixed. -2.8 (14-Apr-96) - Previous release was not compilable without CONFIG_VIDEO_SVGA. - - Better recognition of text modes during mode scan. -2.9 (12-May-96) - Ignored VESA modes 0x80 - 0xff (more VESA BIOS bugs!) -2.10(11-Nov-96) - The whole thing made optional. - - Added the CONFIG_VIDEO_400_HACK switch. - - Added the CONFIG_VIDEO_GFX_HACK switch. - - Code cleanup. -2.11(03-May-97) - Yet another cleanup, now including also the documentation. - - Direct testing of SVGA adapters turned off by default, ``scan`` - offered explicitly on the prompt line. - - Removed the doc section describing adding of new probing - functions as I try to get rid of _all_ hardware probing here. -2.12(25-May-98) Added support for VESA frame buffer graphics. -2.13(14-May-99) Minor documentation fixes. -=============== ================================================================ diff --git a/Documentation/video-output.txt b/Documentation/video-output.txt deleted file mode 100644 index 56d6fa2e2368..000000000000 --- a/Documentation/video-output.txt +++ /dev/null @@ -1,34 +0,0 @@ -Video Output Switcher Control -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -2006 luming.yu@intel.com - -The output sysfs class driver provides an abstract video output layer that -can be used to hook platform specific methods to enable/disable video output -device through common sysfs interface. For example, on my IBM ThinkPad T42 -laptop, The ACPI video driver registered its output devices and read/write -method for 'state' with output sysfs class. The user interface under sysfs is:: - - linux:/sys/class/video_output # tree . - . - |-- CRT0 - | |-- device -> ../../../devices/pci0000:00/0000:00:01.0 - | |-- state - | |-- subsystem -> ../../../class/video_output - | `-- uevent - |-- DVI0 - | |-- device -> ../../../devices/pci0000:00/0000:00:01.0 - | |-- state - | |-- subsystem -> ../../../class/video_output - | `-- uevent - |-- LCD0 - | |-- device -> ../../../devices/pci0000:00/0000:00:01.0 - | |-- state - | |-- subsystem -> ../../../class/video_output - | `-- uevent - `-- TV0 - |-- device -> ../../../devices/pci0000:00/0000:00:01.0 - |-- state - |-- subsystem -> ../../../class/video_output - `-- uevent - diff --git a/Documentation/x86/topology.rst b/Documentation/x86/topology.rst index 8e9704f61017..e29739904e37 100644 --- a/Documentation/x86/topology.rst +++ b/Documentation/x86/topology.rst @@ -9,7 +9,7 @@ representation in the kernel. Update/change when doing changes to the respective code. The architecture-agnostic topology definitions are in -Documentation/cputopology.txt. This file holds x86-specific +Documentation/admin-guide/cputopology.rst. This file holds x86-specific differences/specialities which must not necessarily apply to the generic definitions. Thus, the way to read up on Linux topology on x86 is to start with the generic one and look at this one in parallel for the x86 specifics. diff --git a/MAINTAINERS b/MAINTAINERS index c1593a668f80..570572627fd1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6080,7 +6080,7 @@ M: Ard Biesheuvel L: linux-efi@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/efi/efi.git S: Maintained -F: Documentation/efi-stub.txt +F: Documentation/admin-guide/efi-stub.rst F: arch/*/kernel/efi.c F: arch/x86/boot/compressed/eboot.[ch] F: arch/*/include/asm/efi.h @@ -7088,7 +7088,7 @@ M: Herbert Xu L: linux-crypto@vger.kernel.org S: Odd fixes F: Documentation/devicetree/bindings/rng/ -F: Documentation/hw_random.txt +F: Documentation/admin-guide/hw_random.rst F: drivers/char/hw_random/ F: include/linux/hw_random.h @@ -9398,7 +9398,7 @@ M: "Richard Russon (FlatCap)" L: linux-ntfs-dev@lists.sourceforge.net W: http://www.linux-ntfs.org/content/view/19/37/ S: Maintained -F: Documentation/ldm.txt +F: Documentation/admin-guide/ldm.rst F: block/partitions/ldm.* LSILOGIC MPT FUSION DRIVERS (FC/SAS/SPI) @@ -12058,7 +12058,7 @@ PARALLEL LCD/KEYPAD PANEL DRIVER M: Willy Tarreau M: Ksenija Stanojevic S: Odd Fixes -F: Documentation/auxdisplay/lcd-panel-cgram.rst +F: Documentation/admin-guide/lcd-panel-cgram.rst F: drivers/auxdisplay/panel.c PARALLEL PORT SUBSYSTEM @@ -13476,7 +13476,7 @@ Q: http://patchwork.ozlabs.org/project/rtc-linux/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/abelloni/linux.git S: Maintained F: Documentation/devicetree/bindings/rtc/ -F: Documentation/rtc.txt +F: Documentation/admin-guide/rtc.rst F: drivers/rtc/ F: include/linux/rtc.h F: include/uapi/linux/rtc.h @@ -15306,7 +15306,7 @@ SVGA HANDLING M: Martin Mares L: linux-video@atrey.karlin.mff.cuni.cz S: Maintained -F: Documentation/svga.txt +F: Documentation/admin-guide/svga.rst F: arch/x86/boot/video* SWIOTLB SUBSYSTEM diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 20afd6077465..600c5ba1af41 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1297,7 +1297,7 @@ config SMP will run faster if you say N here. See also , - and the SMP-HOWTO available at + and the SMP-HOWTO available at . If you don't know what to do here, say N. diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 42875ff15671..6d732e451071 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -277,7 +277,7 @@ config SMP machines, but will use only one CPU of a multiprocessor machine. On a uniprocessor machine, the kernel will run faster if you say N. - See also and the SMP-HOWTO + See also and the SMP-HOWTO available at . If you don't know what to do here, say N. diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index c2858ac6a46a..6b1b5941b618 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -679,7 +679,7 @@ config SMP People using multiprocessor machines who say Y here should also say Y to "Enhanced Real Time Clock Support", below. - See also and the SMP-HOWTO + See also and the SMP-HOWTO available at . If you don't know what to do here, say N. diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index e9f5d62e9817..7926a2e11bdc 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -180,7 +180,7 @@ config SMP Y to "Enhanced Real Time Clock Support", below. The "Advanced Power Management" code will be disabled if you say Y here. - See also and the SMP-HOWTO + See also and the SMP-HOWTO available at . If you don't know what to do here, say N. diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9505066b7ba3..9e95af666b33 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -402,7 +402,7 @@ config SMP Management" code will be disabled if you say Y here. See also , - and the SMP-HOWTO available at + and the SMP-HOWTO available at . If you don't know what to do here, say N. @@ -1959,7 +1959,7 @@ config EFI_STUB This kernel feature allows a bzImage to be loaded directly by EFI firmware without the use of a bootloader. - See Documentation/efi-stub.txt for more information. + See Documentation/admin-guide/efi-stub.rst for more information. config EFI_MIXED bool "EFI mixed-mode support" diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig index 37b9710cc80a..702689a628f0 100644 --- a/block/partitions/Kconfig +++ b/block/partitions/Kconfig @@ -194,7 +194,7 @@ config LDM_PARTITION Normal partitions are now called Basic Disks under Windows 2000, XP, and Vista. - For a fuller description read . + For a fuller description read . If unsure, say N. diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index 442403abd73a..3e866885a405 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -291,7 +291,7 @@ config RTC and set the RTC in an SMP compatible fashion. If you think you have a use for such a device (such as periodic data - sampling), then say Y here, and read + sampling), then say Y here, and read for details. To compile this driver as a module, choose M here: the @@ -313,7 +313,7 @@ config JS_RTC /dev/rtc. If you think you have a use for such a device (such as periodic data - sampling), then say Y here, and read + sampling), then say Y here, and read for details. To compile this driver as a module, choose M here: the diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index 95be7228f327..9044d31ab1a1 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -4,7 +4,7 @@ * Copyright 2006 Michael Buesch * Copyright 2005 (c) MontaVista Software, Inc. * - * Please read Documentation/hw_random.txt for details on use. + * Please read Documentation/admin-guide/hw_random.rst for details on use. * * This software may be used and distributed according to the terms * of the GNU General Public License, incorporated herein by reference. diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h index c0b93e0ff0c0..8e6dd908da21 100644 --- a/include/linux/hw_random.h +++ b/include/linux/hw_random.h @@ -1,7 +1,7 @@ /* Hardware Random Number Generator - Please read Documentation/hw_random.txt for details on use. + Please read Documentation/admin-guide/hw_random.rst for details on use. ---------------------------------------------------------- This software may be used and distributed according to the terms -- cgit v1.2.3-55-g7522 From c7ca0b614513afba57824cae68447f9c32b1ee61 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 15 Jul 2019 07:21:44 -0700 Subject: Revert "x86/ptrace: Prevent ptrace from clearing the FS/GS selector" and fix the test This reverts commit 48f5e52e916b55fb73754833efbacc7f8081a159. The ptrace ABI change was a prerequisite to the proposed design for FSGSBASE. Since FSGSBASE support has been reverted, and since I'm not convinced that the ABI was ever adequately tested, revert the ABI change as well. This also modifies the test case so that it tests the preexisting behavior. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/fca39c478ea7fb15bc76fe8a36bd180810a067f6.1563200250.git.luto@kernel.org --- arch/x86/kernel/ptrace.c | 14 ++++++++++++-- tools/testing/selftests/x86/fsgsbase.c | 22 ++++------------------ 2 files changed, 16 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 71691a8310e7..0fdbe89d0754 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -369,12 +369,22 @@ static int putreg(struct task_struct *child, case offsetof(struct user_regs_struct,fs_base): if (value >= TASK_SIZE_MAX) return -EIO; - x86_fsbase_write_task(child, value); + /* + * When changing the FS base, use do_arch_prctl_64() + * to set the index to zero and to set the base + * as requested. + */ + if (child->thread.fsbase != value) + return do_arch_prctl_64(child, ARCH_SET_FS, value); return 0; case offsetof(struct user_regs_struct,gs_base): + /* + * Exactly the same here as the %fs handling above. + */ if (value >= TASK_SIZE_MAX) return -EIO; - x86_gsbase_write_task(child, value); + if (child->thread.gsbase != value) + return do_arch_prctl_64(child, ARCH_SET_GS, value); return 0; #endif } diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c index 5ab4c60c100e..15a329da59fa 100644 --- a/tools/testing/selftests/x86/fsgsbase.c +++ b/tools/testing/selftests/x86/fsgsbase.c @@ -489,25 +489,11 @@ static void test_ptrace_write_gsbase(void) * selector value is changed or not by the GSBASE write in * a ptracer. */ - if (gs != *shared_scratch) { - nerrs++; - printf("[FAIL]\tGS changed to %lx\n", gs); - - /* - * On older kernels, poking a nonzero value into the - * base would zero the selector. On newer kernels, - * this behavior has changed -- poking the base - * changes only the base and, if FSGSBASE is not - * available, this may have no effect. - */ - if (gs == 0) - printf("\tNote: this is expected behavior on older kernels.\n"); - } else if (have_fsgsbase && (base != 0xFF)) { - nerrs++; - printf("[FAIL]\tGSBASE changed to %lx\n", base); + if (gs == 0 && base == 0xFF) { + printf("[OK]\tGS was reset as expected\n"); } else { - printf("[OK]\tGS remained 0x%hx%s", *shared_scratch, have_fsgsbase ? " and GSBASE changed to 0xFF" : ""); - printf("\n"); + nerrs++; + printf("[FAIL]\tGS=0x%lx, GSBASE=0x%lx (should be 0, 0xFF)\n", gs, base); } } -- cgit v1.2.3-55-g7522 From 9087c37584fb7d8315877bb55f85e4268cc0b4f4 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Wed, 10 Jul 2019 19:01:19 +0000 Subject: dma-direct: Force unencrypted DMA under SME for certain DMA masks If a device doesn't support DMA to a physical address that includes the encryption bit (currently bit 47, so 48-bit DMA), then the DMA must occur to unencrypted memory. SWIOTLB is used to satisfy that requirement if an IOMMU is not active (enabled or configured in passthrough mode). However, commit fafadcd16595 ("swiotlb: don't dip into swiotlb pool for coherent allocations") modified the coherent allocation support in SWIOTLB to use the DMA direct coherent allocation support. When an IOMMU is not active, this resulted in dma_alloc_coherent() failing for devices that didn't support DMA addresses that included the encryption bit. Addressing this requires changes to the force_dma_unencrypted() function in kernel/dma/direct.c. Since the function is now non-trivial and SME/SEV specific, update the DMA direct support to add an arch override for the force_dma_unencrypted() function. The arch override is selected when CONFIG_AMD_MEM_ENCRYPT is set. The arch override function resides in the arch/x86/mm/mem_encrypt.c file and forces unencrypted DMA when either SEV is active or SME is active and the device does not support DMA to physical addresses that include the encryption bit. Fixes: fafadcd16595 ("swiotlb: don't dip into swiotlb pool for coherent allocations") Suggested-by: Christoph Hellwig Signed-off-by: Tom Lendacky Acked-by: Thomas Gleixner [hch: moved the force_dma_unencrypted declaration to dma-mapping.h, fold the s390 fix from Halil Pasic] Signed-off-by: Christoph Hellwig --- arch/s390/Kconfig | 1 + arch/s390/mm/init.c | 7 ++++++- arch/x86/Kconfig | 1 + arch/x86/mm/mem_encrypt.c | 30 ++++++++++++++++++++++++++++++ include/linux/dma-direct.h | 9 +++++++++ kernel/dma/Kconfig | 3 +++ kernel/dma/direct.c | 16 ++++------------ 7 files changed, 54 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 5d8570ed6cab..a4ad2733eedf 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -189,6 +189,7 @@ config S390 select VIRT_CPU_ACCOUNTING select ARCH_HAS_SCALED_CPUTIME select HAVE_NMI + select ARCH_HAS_FORCE_DMA_UNENCRYPTED select SWIOTLB select GENERIC_ALLOCATOR diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index f0bee6af3960..78c319c5ce48 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include #include @@ -161,6 +161,11 @@ bool sev_active(void) return is_prot_virt_guest(); } +bool force_dma_unencrypted(struct device *dev) +{ + return sev_active(); +} + /* protected virtualization */ static void pv_init(void) { diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 879741336771..d1afe92bf994 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1528,6 +1528,7 @@ config AMD_MEM_ENCRYPT depends on X86_64 && CPU_SUP_AMD select DYNAMIC_PHYSICAL_MASK select ARCH_USE_MEMREMAP_PROT + select ARCH_HAS_FORCE_DMA_UNENCRYPTED ---help--- Say yes to enable support for the encryption of system memory. This requires an AMD processor that supports Secure Memory diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index e0df96fdfe46..c805f0a5c16e 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -15,6 +15,10 @@ #include #include #include +#include +#include +#include +#include #include #include @@ -348,6 +352,32 @@ bool sev_active(void) } EXPORT_SYMBOL(sev_active); +/* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */ +bool force_dma_unencrypted(struct device *dev) +{ + /* + * For SEV, all DMA must be to unencrypted addresses. + */ + if (sev_active()) + return true; + + /* + * For SME, all DMA must be to unencrypted addresses if the + * device does not support DMA to addresses that include the + * encryption mask. + */ + if (sme_active()) { + u64 dma_enc_mask = DMA_BIT_MASK(__ffs64(sme_me_mask)); + u64 dma_dev_mask = min_not_zero(dev->coherent_dma_mask, + dev->bus_dma_mask); + + if (dma_dev_mask <= dma_enc_mask) + return true; + } + + return false; +} + /* Architecture __weak replacement functions */ void __init mem_encrypt_free_decrypted_mem(void) { diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index b7338702592a..adf993a3bd58 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -32,6 +32,15 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) } #endif /* !CONFIG_ARCH_HAS_PHYS_TO_DMA */ +#ifdef CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED +bool force_dma_unencrypted(struct device *dev); +#else +static inline bool force_dma_unencrypted(struct device *dev) +{ + return false; +} +#endif /* CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED */ + /* * If memory encryption is supported, phys_to_dma will set the memory encryption * bit in the DMA address, and dma_to_phys will clear it. The raw __phys_to_dma diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 70f8f8d9200e..9decbba255fc 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -48,6 +48,9 @@ config ARCH_HAS_DMA_COHERENT_TO_PFN config ARCH_HAS_DMA_MMAP_PGPROT bool +config ARCH_HAS_FORCE_DMA_UNENCRYPTED + bool + config DMA_NONCOHERENT_CACHE_SYNC bool diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index b90e1aede743..d7cec866d16b 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -23,14 +23,6 @@ #define ARCH_ZONE_DMA_BITS 24 #endif -/* - * For AMD SEV all DMA must be to unencrypted addresses. - */ -static inline bool force_dma_unencrypted(void) -{ - return sev_active(); -} - static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size) { if (!dev->dma_mask) { @@ -46,7 +38,7 @@ static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size) static inline dma_addr_t phys_to_dma_direct(struct device *dev, phys_addr_t phys) { - if (force_dma_unencrypted()) + if (force_dma_unencrypted(dev)) return __phys_to_dma(dev, phys); return phys_to_dma(dev, phys); } @@ -67,7 +59,7 @@ static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, if (dev->bus_dma_mask && dev->bus_dma_mask < dma_mask) dma_mask = dev->bus_dma_mask; - if (force_dma_unencrypted()) + if (force_dma_unencrypted(dev)) *phys_mask = __dma_to_phys(dev, dma_mask); else *phys_mask = dma_to_phys(dev, dma_mask); @@ -159,7 +151,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, } ret = page_address(page); - if (force_dma_unencrypted()) { + if (force_dma_unencrypted(dev)) { set_memory_decrypted((unsigned long)ret, 1 << get_order(size)); *dma_handle = __phys_to_dma(dev, page_to_phys(page)); } else { @@ -192,7 +184,7 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, return; } - if (force_dma_unencrypted()) + if (force_dma_unencrypted(dev)) set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && -- cgit v1.2.3-55-g7522 From e74bd96989dd42a51a73eddb4a5510a6f5e42ac3 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 9 Jul 2019 19:44:03 -0700 Subject: x86/boot: Fix memory leak in default_get_smp_config() When default_get_smp_config() is called with early == 1 and mpf->feature1 is non-zero, mpf is leaked because the return path does not do early_memunmap(). Fix this and share a common exit routine. Fixes: 5997efb96756 ("x86/boot: Use memremap() to map the MPF and MPC data") Reported-by: Cfir Cohen Signed-off-by: David Rientjes Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1907091942570.28240@chino.kir.corp.google.com --- arch/x86/kernel/mpparse.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 1bfe5c6e6cfe..afac7ccce72f 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -546,17 +546,15 @@ void __init default_get_smp_config(unsigned int early) * local APIC has default address */ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - return; + goto out; } pr_info("Default MP configuration #%d\n", mpf->feature1); construct_default_ISA_mptable(mpf->feature1); } else if (mpf->physptr) { - if (check_physptr(mpf, early)) { - early_memunmap(mpf, sizeof(*mpf)); - return; - } + if (check_physptr(mpf, early)) + goto out; } else BUG(); @@ -565,7 +563,7 @@ void __init default_get_smp_config(unsigned int early) /* * Only use the first configuration found. */ - +out: early_memunmap(mpf, sizeof(*mpf)); } -- cgit v1.2.3-55-g7522 From ffdb07f31252625b7bcbf1f424d7beccff02ba97 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 10 Jul 2019 13:19:35 -0700 Subject: x86/mm: Free sme_early_buffer after init The contents of sme_early_buffer should be cleared after __sme_early_enc_dec() because it is used to move encrypted and decrypted data, but since __sme_early_enc_dec() is __init this buffer simply can be freed after init. This saves a page that is otherwise unreferenced after init. Reported-by: Cfir Cohen Signed-off-by: David Rientjes Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1907101318170.197432@chino.kir.corp.google.com --- arch/x86/mm/mem_encrypt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index e0df96fdfe46..e94e0a62ba92 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -41,7 +41,7 @@ EXPORT_SYMBOL_GPL(sev_enable_key); bool sev_enabled __section(.data); /* Buffer used for early in-place encryption by BSP, no locking needed */ -static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE); +static char sme_early_buffer[PAGE_SIZE] __initdata __aligned(PAGE_SIZE); /* * This routine does not change the underlying encryption setting of the -- cgit v1.2.3-55-g7522 From ec6335586953b0df32f83ef696002063090c7aef Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Mon, 8 Jul 2019 17:36:45 -0400 Subject: x86/apic: Silence -Wtype-limits compiler warnings There are many compiler warnings like this, In file included from ./arch/x86/include/asm/smp.h:13, from ./arch/x86/include/asm/mmzone_64.h:11, from ./arch/x86/include/asm/mmzone.h:5, from ./include/linux/mmzone.h:969, from ./include/linux/gfp.h:6, from ./include/linux/mm.h:10, from arch/x86/kernel/apic/io_apic.c:34: arch/x86/kernel/apic/io_apic.c: In function 'check_timer': ./arch/x86/include/asm/apic.h:37:11: warning: comparison of unsigned expression >= 0 is always true [-Wtype-limits] if ((v) <= apic_verbosity) \ ^~ arch/x86/kernel/apic/io_apic.c:2160:2: note: in expansion of macro 'apic_printk' apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " ^~~~~~~~~~~ ./arch/x86/include/asm/apic.h:37:11: warning: comparison of unsigned expression >= 0 is always true [-Wtype-limits] if ((v) <= apic_verbosity) \ ^~ arch/x86/kernel/apic/io_apic.c:2207:4: note: in expansion of macro 'apic_printk' apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " ^~~~~~~~~~~ APIC_QUIET is 0, so silence them by making apic_verbosity type int. Signed-off-by: Qian Cai Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1562621805-24789-1-git-send-email-cai@lca.pw --- arch/x86/include/asm/apic.h | 2 +- arch/x86/kernel/apic/apic.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 050e5f9ebf81..e647aa095867 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -49,7 +49,7 @@ static inline void generic_apic_probe(void) #ifdef CONFIG_X86_LOCAL_APIC -extern unsigned int apic_verbosity; +extern int apic_verbosity; extern int local_apic_timer_c2_ok; extern int disable_apic; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 1bd91cb7b320..f5291362da1a 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -183,7 +183,7 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); /* * Debug level, exported for io_apic.c */ -unsigned int apic_verbosity; +int apic_verbosity; int pic_mode; -- cgit v1.2.3-55-g7522 From f709f81483d652b4ae5bbda2204b95593ce07c8f Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Mon, 15 Jul 2019 10:47:09 +0800 Subject: x86/e820: Use proper booleans instead of 0/1 This fixes the following coccinelle warning: ./arch/x86/kernel/e820.c:89:9-10: WARNING: return of 0/1 in function '_e820__mapped_any' with return type bool Return type bool instead of 0/1. Signed-off-by: Yi Wang Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1563158829-44373-1-git-send-email-wang.yi59@zte.com.cn --- arch/x86/kernel/e820.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index e69408bf664b..7da2bcd2b8eb 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -86,9 +86,9 @@ static bool _e820__mapped_any(struct e820_table *table, continue; if (entry->addr >= end || entry->addr + entry->size <= start) continue; - return 1; + return true; } - return 0; + return false; } bool e820__mapped_raw_any(u64 start, u64 end, enum e820_type type) -- cgit v1.2.3-55-g7522 From 29e7e9664aec17b94a9c8c5a75f8d216a206aa3a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 12 Jul 2019 11:08:05 +0200 Subject: x86: math-emu: Hide clang warnings for 16-bit overflow clang warns about a few parts of the math-emu implementation where a 16-bit integer becomes negative during assignment: arch/x86/math-emu/poly_tan.c:88:35: error: implicit conversion from 'int' to 'short' changes value from 49216 to -16320 [-Werror,-Wconstant-conversion] (0x41 + EXTENDED_Ebias) | SIGN_Negative); ~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~ arch/x86/math-emu/fpu_emu.h:180:58: note: expanded from macro 'setexponent16' #define setexponent16(x,y) { (*(short *)&((x)->exp)) = (y); } ~ ^ arch/x86/math-emu/reg_constant.c:37:32: error: implicit conversion from 'int' to 'short' changes value from 49085 to -16451 [-Werror,-Wconstant-conversion] FPU_REG const CONST_PI2extra = MAKE_REG(NEG, -66, ^~~~~~~~~~~~~~~~~~ arch/x86/math-emu/reg_constant.c:21:25: note: expanded from macro 'MAKE_REG' ((EXTENDED_Ebias+(e)) | ((SIGN_##s != 0)*0x8000)) } ~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~ arch/x86/math-emu/reg_constant.c:48:28: error: implicit conversion from 'int' to 'short' changes value from 65535 to -1 [-Werror,-Wconstant-conversion] FPU_REG const CONST_QNaN = MAKE_REG(NEG, EXP_OVER, 0x00000000, 0xC0000000); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ arch/x86/math-emu/reg_constant.c:21:25: note: expanded from macro 'MAKE_REG' ((EXTENDED_Ebias+(e)) | ((SIGN_##s != 0)*0x8000)) } ~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~ The code is correct as is, so add a typecast to shut up the warnings. Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190712090816.350668-1-arnd@arndb.de --- arch/x86/math-emu/fpu_emu.h | 2 +- arch/x86/math-emu/reg_constant.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/math-emu/fpu_emu.h b/arch/x86/math-emu/fpu_emu.h index a5a41ec58072..0c122226ca56 100644 --- a/arch/x86/math-emu/fpu_emu.h +++ b/arch/x86/math-emu/fpu_emu.h @@ -177,7 +177,7 @@ static inline void reg_copy(FPU_REG const *x, FPU_REG *y) #define setexponentpos(x,y) { (*(short *)&((x)->exp)) = \ ((y) + EXTENDED_Ebias) & 0x7fff; } #define exponent16(x) (*(short *)&((x)->exp)) -#define setexponent16(x,y) { (*(short *)&((x)->exp)) = (y); } +#define setexponent16(x,y) { (*(short *)&((x)->exp)) = (u16)(y); } #define addexponent(x,y) { (*(short *)&((x)->exp)) += (y); } #define stdexp(x) { (*(short *)&((x)->exp)) += EXTENDED_Ebias; } diff --git a/arch/x86/math-emu/reg_constant.c b/arch/x86/math-emu/reg_constant.c index 8dc9095bab22..742619e94bdf 100644 --- a/arch/x86/math-emu/reg_constant.c +++ b/arch/x86/math-emu/reg_constant.c @@ -18,7 +18,7 @@ #include "control_w.h" #define MAKE_REG(s, e, l, h) { l, h, \ - ((EXTENDED_Ebias+(e)) | ((SIGN_##s != 0)*0x8000)) } + (u16)((EXTENDED_Ebias+(e)) | ((SIGN_##s != 0)*0x8000)) } FPU_REG const CONST_1 = MAKE_REG(POS, 0, 0x00000000, 0x80000000); #if 0 -- cgit v1.2.3-55-g7522 From 50e04acf2990d0d93983720b0a85b11ef805df60 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Sat, 13 Jul 2019 00:41:52 +0200 Subject: x86/process: Delete useless check for dead process with LDT At release_thread(), ->mm is NULL; and it is fine for the former mm to still have an LDT. Delete this check in process_64.c, similar to commit 2684927c6b93 ("[PATCH] x86: Deprecate useless bug"), which did the same in process_32.c. Signed-off-by: Jann Horn Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190712224152.13129-1-jannh@google.com --- arch/x86/kernel/process_64.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 250e4c4ac6d9..af64519b2695 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -143,17 +143,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) void release_thread(struct task_struct *dead_task) { - if (dead_task->mm) { -#ifdef CONFIG_MODIFY_LDT_SYSCALL - if (dead_task->mm->context.ldt) { - pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n", - dead_task->comm, - dead_task->mm->context.ldt->entries, - dead_task->mm->context.ldt->nr_entries); - BUG(); - } -#endif - } + WARN_ON(dead_task->mm); } enum which_selector { -- cgit v1.2.3-55-g7522 From 3a7f0adfe7c27cdaf6dc3456226a430398732e2c Mon Sep 17 00:00:00 2001 From: Stephen Kitt Date: Tue, 16 Jul 2019 16:27:04 -0700 Subject: arch/*: remove unused isa_page_to_bus() isa_page_to_bus() is deprecated and is no longer used anywhere. Remove it entirely. Link: http://lkml.kernel.org/r/20190613161155.16946-1-steve@sk2.org Signed-off-by: Stephen Kitt Acked-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/io.h | 5 ----- arch/arm/include/asm/io.h | 1 - arch/mips/include/asm/io.h | 2 -- arch/x86/include/asm/io.h | 1 - 4 files changed, 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/include/asm/io.h b/arch/alpha/include/asm/io.h index ccf9d65166bb..af2c0063dc75 100644 --- a/arch/alpha/include/asm/io.h +++ b/arch/alpha/include/asm/io.h @@ -93,11 +93,6 @@ static inline void * phys_to_virt(unsigned long address) #define page_to_phys(page) page_to_pa(page) -static inline dma_addr_t __deprecated isa_page_to_bus(struct page *page) -{ - return page_to_phys(page); -} - /* Maximum PIO space address supported? */ #define IO_SPACE_LIMIT 0xffff diff --git a/arch/arm/include/asm/io.h b/arch/arm/include/asm/io.h index f11c35cf0b74..7a0596fcb2e7 100644 --- a/arch/arm/include/asm/io.h +++ b/arch/arm/include/asm/io.h @@ -30,7 +30,6 @@ * ISA I/O bus memory addresses are 1:1 with the physical address. */ #define isa_virt_to_bus virt_to_phys -#define isa_page_to_bus page_to_phys #define isa_bus_to_virt phys_to_virt /* diff --git a/arch/mips/include/asm/io.h b/arch/mips/include/asm/io.h index 29997e42480e..1790274c27eb 100644 --- a/arch/mips/include/asm/io.h +++ b/arch/mips/include/asm/io.h @@ -149,8 +149,6 @@ static inline void *isa_bus_to_virt(unsigned long address) return phys_to_virt(address); } -#define isa_page_to_bus page_to_phys - /* * However PCI ones are not necessarily 1:1 and therefore these interfaces * are forbidden in portable PCI drivers. diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index a06a9f8294ea..6bed97ff6db2 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -165,7 +165,6 @@ static inline unsigned int isa_virt_to_bus(volatile void *address) { return (unsigned int)virt_to_phys(address); } -#define isa_page_to_bus(page) ((unsigned int)page_to_phys(page)) #define isa_bus_to_virt phys_to_virt /* -- cgit v1.2.3-55-g7522 From 0f472d04f59ff89d15b2a1c4eafde7317ddd67a2 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 16 Jul 2019 16:27:33 -0700 Subject: mm/ioremap: probe platform for p4d huge map support Finish up what commit c2febafc6773 ("mm: convert generic code to 5-level paging") started while levelling up P4D huge mapping support at par with PUD and PMD. A new arch call back arch_ioremap_p4d_supported() is added which just maintains status quo (P4D huge map not supported) on x86, arm64 and powerpc. When HAVE_ARCH_HUGE_VMAP is enabled its just a simple check from the arch about the support, hence runtime effects are minimal. Link: http://lkml.kernel.org/r/1561699231-20991-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Thomas Gleixner Acked-by: Michael Ellerman (powerpc) Cc: Catalin Marinas Cc: Will Deacon Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/mmu.c | 5 +++++ arch/powerpc/mm/book3s64/radix_pgtable.c | 5 +++++ arch/x86/mm/ioremap.c | 5 +++++ include/linux/io.h | 1 + lib/ioremap.c | 2 ++ 5 files changed, 18 insertions(+) (limited to 'arch/x86') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 1b49c08dfa2b..e661469cabdd 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -942,6 +942,11 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys) return dt_virt; } +int __init arch_ioremap_p4d_supported(void) +{ + return 0; +} + int __init arch_ioremap_pud_supported(void) { /* diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 65c2ba1e1783..b4ca9e95e678 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -1237,3 +1237,8 @@ int radix__ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, return 0; } } + +int __init arch_ioremap_p4d_supported(void) +{ + return 0; +} diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index e500f1df1140..63e99f15d7cf 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -459,6 +459,11 @@ void iounmap(volatile void __iomem *addr) } EXPORT_SYMBOL(iounmap); +int __init arch_ioremap_p4d_supported(void) +{ + return 0; +} + int __init arch_ioremap_pud_supported(void) { #ifdef CONFIG_X86_64 diff --git a/include/linux/io.h b/include/linux/io.h index 9876e5801a9d..accac822336a 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -33,6 +33,7 @@ static inline int ioremap_page_range(unsigned long addr, unsigned long end, #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP void __init ioremap_huge_init(void); +int arch_ioremap_p4d_supported(void); int arch_ioremap_pud_supported(void); int arch_ioremap_pmd_supported(void); #else diff --git a/lib/ioremap.c b/lib/ioremap.c index a95161d9c883..0a2ffadc6d71 100644 --- a/lib/ioremap.c +++ b/lib/ioremap.c @@ -30,6 +30,8 @@ early_param("nohugeiomap", set_nohugeiomap); void __init ioremap_huge_init(void) { if (!ioremap_huge_disabled) { + if (arch_ioremap_p4d_supported()) + ioremap_p4d_capable = 1; if (arch_ioremap_pud_supported()) ioremap_pud_capable = 1; if (arch_ioremap_pmd_supported()) -- cgit v1.2.3-55-g7522 From b98cca444d287a63dd96df04af7fb9793567599e Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 16 Jul 2019 16:28:00 -0700 Subject: mm, kprobes: generalize and rename notify_page_fault() as kprobe_page_fault() Architectures which support kprobes have very similar boilerplate around calling kprobe_fault_handler(). Use a helper function in kprobes.h to unify them, based on the x86 code. This changes the behaviour for other architectures when preemption is enabled. Previously, they would have disabled preemption while calling the kprobe handler. However, preemption would be disabled if this fault was due to a kprobe, so we know the fault was not due to a kprobe handler and can simply return failure. This behaviour was introduced in commit a980c0ef9f6d ("x86/kprobes: Refactor kprobes_fault() like kprobe_exceptions_notify()") [anshuman.khandual@arm.com: export kprobe_fault_handler()] Link: http://lkml.kernel.org/r/1561133358-8876-1-git-send-email-anshuman.khandual@arm.com Link: http://lkml.kernel.org/r/1560420444-25737-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Dave Hansen Cc: Michal Hocko Cc: Matthew Wilcox Cc: Mark Rutland Cc: Christophe Leroy Cc: Stephen Rothwell Cc: Andrey Konovalov Cc: Michael Ellerman Cc: Paul Mackerras Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: Tony Luck Cc: Fenghua Yu Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Yoshinori Sato Cc: "David S. Miller" Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Andy Lutomirski Cc: Vineet Gupta Cc: James Hogan Cc: Paul Burton Cc: Ralf Baechle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/fault.c | 24 +----------------------- arch/arm64/mm/fault.c | 24 +----------------------- arch/ia64/mm/fault.c | 24 +----------------------- arch/mips/include/asm/kprobes.h | 1 + arch/mips/kernel/kprobes.c | 2 +- arch/powerpc/mm/fault.c | 23 ++--------------------- arch/s390/mm/fault.c | 16 +--------------- arch/sh/mm/fault.c | 18 ++---------------- arch/sparc/mm/fault_64.c | 16 +--------------- arch/x86/mm/fault.c | 21 ++------------------- include/linux/kprobes.h | 19 +++++++++++++++++++ 11 files changed, 32 insertions(+), 156 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 0e417233dad7..890eeaac3cbb 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -27,28 +27,6 @@ #ifdef CONFIG_MMU -#ifdef CONFIG_KPROBES -static inline int notify_page_fault(struct pt_regs *regs, unsigned int fsr) -{ - int ret = 0; - - if (!user_mode(regs)) { - /* kprobe_running() needs smp_processor_id() */ - preempt_disable(); - if (kprobe_running() && kprobe_fault_handler(regs, fsr)) - ret = 1; - preempt_enable(); - } - - return ret; -} -#else -static inline int notify_page_fault(struct pt_regs *regs, unsigned int fsr) -{ - return 0; -} -#endif - /* * This is useful to dump out the page tables associated with * 'addr' in mm 'mm'. @@ -265,7 +243,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) vm_fault_t fault; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; - if (notify_page_fault(regs, fsr)) + if (kprobe_page_fault(regs, fsr)) return 0; tsk = current; diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index c8c61b1eb479..9568c116ac7f 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -59,28 +59,6 @@ static inline const struct fault_info *esr_to_debug_fault_info(unsigned int esr) return debug_fault_info + DBG_ESR_EVT(esr); } -#ifdef CONFIG_KPROBES -static inline int notify_page_fault(struct pt_regs *regs, unsigned int esr) -{ - int ret = 0; - - /* kprobe_running() needs smp_processor_id() */ - if (!user_mode(regs)) { - preempt_disable(); - if (kprobe_running() && kprobe_fault_handler(regs, esr)) - ret = 1; - preempt_enable(); - } - - return ret; -} -#else -static inline int notify_page_fault(struct pt_regs *regs, unsigned int esr) -{ - return 0; -} -#endif - static void data_abort_decode(unsigned int esr) { pr_alert("Data abort info:\n"); @@ -434,7 +412,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, unsigned long vm_flags = VM_READ | VM_WRITE; unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; - if (notify_page_fault(regs, esr)) + if (kprobe_page_fault(regs, esr)) return 0; /* diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 3c3a283d3172..c2f299fe9e04 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -21,28 +21,6 @@ extern int die(char *, struct pt_regs *, long); -#ifdef CONFIG_KPROBES -static inline int notify_page_fault(struct pt_regs *regs, int trap) -{ - int ret = 0; - - if (!user_mode(regs)) { - /* kprobe_running() needs smp_processor_id() */ - preempt_disable(); - if (kprobe_running() && kprobe_fault_handler(regs, trap)) - ret = 1; - preempt_enable(); - } - - return ret; -} -#else -static inline int notify_page_fault(struct pt_regs *regs, int trap) -{ - return 0; -} -#endif - /* * Return TRUE if ADDRESS points at a page in the kernel's mapped segment * (inside region 5, on ia64) and that page is present. @@ -116,7 +94,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re /* * This is to handle the kprobes on user space access instructions */ - if (notify_page_fault(regs, TRAP_BRKPT)) + if (kprobe_page_fault(regs, TRAP_BRKPT)) return; if (user_mode(regs)) diff --git a/arch/mips/include/asm/kprobes.h b/arch/mips/include/asm/kprobes.h index 3cf8e4d5fa28..68b1e5d458cf 100644 --- a/arch/mips/include/asm/kprobes.h +++ b/arch/mips/include/asm/kprobes.h @@ -41,6 +41,7 @@ do { \ #define kretprobe_blacklist_size 0 void arch_remove_kprobe(struct kprobe *p); +int kprobe_fault_handler(struct pt_regs *regs, int trapnr); /* Architecture specific copy of original instruction*/ struct arch_specific_insn { diff --git a/arch/mips/kernel/kprobes.c b/arch/mips/kernel/kprobes.c index 81ba1d3c367c..6cfae2411c04 100644 --- a/arch/mips/kernel/kprobes.c +++ b/arch/mips/kernel/kprobes.c @@ -398,7 +398,7 @@ out: return 1; } -static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr) +int kprobe_fault_handler(struct pt_regs *regs, int trapnr) { struct kprobe *cur = kprobe_running(); struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index d989592b6fc8..8432c281de92 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -42,26 +42,6 @@ #include #include -static inline bool notify_page_fault(struct pt_regs *regs) -{ - bool ret = false; - -#ifdef CONFIG_KPROBES - /* kprobe_running() needs smp_processor_id() */ - if (!user_mode(regs)) { - preempt_disable(); - if (kprobe_running() && kprobe_fault_handler(regs, 11)) - ret = true; - preempt_enable(); - } -#endif /* CONFIG_KPROBES */ - - if (unlikely(debugger_fault_handler(regs))) - ret = true; - - return ret; -} - /* * Check whether the instruction inst is a store using * an update addressing form which will update r1. @@ -461,8 +441,9 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address, int is_write = page_fault_is_write(error_code); vm_fault_t fault, major = 0; bool must_retry = false; + bool kprobe_fault = kprobe_page_fault(regs, 11); - if (notify_page_fault(regs)) + if (unlikely(debugger_fault_handler(regs) || kprobe_fault)) return 0; if (unlikely(page_fault_is_bad(error_code))) { diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 0ba174f779da..63507662828f 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -67,20 +67,6 @@ static int __init fault_init(void) } early_initcall(fault_init); -static inline int notify_page_fault(struct pt_regs *regs) -{ - int ret = 0; - - /* kprobe_running() needs smp_processor_id() */ - if (kprobes_built_in() && !user_mode(regs)) { - preempt_disable(); - if (kprobe_running() && kprobe_fault_handler(regs, 14)) - ret = 1; - preempt_enable(); - } - return ret; -} - /* * Find out which address space caused the exception. */ @@ -412,7 +398,7 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) */ clear_pt_regs_flag(regs, PIF_PER_TRAP); - if (notify_page_fault(regs)) + if (kprobe_page_fault(regs, 14)) return 0; mm = tsk->mm; diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index 3093bc372138..5f51456f4fc7 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -24,20 +24,6 @@ #include #include -static inline int notify_page_fault(struct pt_regs *regs, int trap) -{ - int ret = 0; - - if (kprobes_built_in() && !user_mode(regs)) { - preempt_disable(); - if (kprobe_running() && kprobe_fault_handler(regs, trap)) - ret = 1; - preempt_enable(); - } - - return ret; -} - static void force_sig_info_fault(int si_signo, int si_code, unsigned long address) { @@ -412,14 +398,14 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, if (unlikely(fault_in_kernel_space(address))) { if (vmalloc_fault(address) >= 0) return; - if (notify_page_fault(regs, vec)) + if (kprobe_page_fault(regs, vec)) return; bad_area_nosemaphore(regs, error_code, address); return; } - if (unlikely(notify_page_fault(regs, vec))) + if (unlikely(kprobe_page_fault(regs, vec))) return; /* Only enable interrupts if they were on before the fault */ diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index 83fda4d9c3b2..2371fb6b97e4 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -38,20 +38,6 @@ int show_unhandled_signals = 1; -static inline __kprobes int notify_page_fault(struct pt_regs *regs) -{ - int ret = 0; - - /* kprobe_running() needs smp_processor_id() */ - if (kprobes_built_in() && !user_mode(regs)) { - preempt_disable(); - if (kprobe_running() && kprobe_fault_handler(regs, 0)) - ret = 1; - preempt_enable(); - } - return ret; -} - static void __kprobes unhandled_fault(unsigned long address, struct task_struct *tsk, struct pt_regs *regs) @@ -285,7 +271,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs) fault_code = get_thread_fault_code(); - if (notify_page_fault(regs)) + if (kprobe_page_fault(regs, 0)) goto exit_exception; si_code = SEGV_MAPERR; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 794f364cb882..d1634c59ed56 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -46,23 +46,6 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr) return 0; } -static nokprobe_inline int kprobes_fault(struct pt_regs *regs) -{ - if (!kprobes_built_in()) - return 0; - if (user_mode(regs)) - return 0; - /* - * To be potentially processing a kprobe fault and to be allowed to call - * kprobe_running(), we have to be non-preemptible. - */ - if (preemptible()) - return 0; - if (!kprobe_running()) - return 0; - return kprobe_fault_handler(regs, X86_TRAP_PF); -} - /* * Prefetch quirks: * @@ -1282,7 +1265,7 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, return; /* kprobes don't want to hook the spurious faults: */ - if (kprobes_fault(regs)) + if (kprobe_page_fault(regs, X86_TRAP_PF)) return; /* @@ -1313,7 +1296,7 @@ void do_user_addr_fault(struct pt_regs *regs, mm = tsk->mm; /* kprobes don't want to hook the spurious faults: */ - if (unlikely(kprobes_fault(regs))) + if (unlikely(kprobe_page_fault(regs, X86_TRAP_PF))) return; /* diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 443d9800ca3f..04bdaf01112c 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -458,4 +458,23 @@ static inline bool is_kprobe_optinsn_slot(unsigned long addr) } #endif +/* Returns true if kprobes handled the fault */ +static nokprobe_inline bool kprobe_page_fault(struct pt_regs *regs, + unsigned int trap) +{ + if (!kprobes_built_in()) + return false; + if (user_mode(regs)) + return false; + /* + * To be potentially processing a kprobe fault and to be allowed + * to call kprobe_running(), we have to be non-preemptible. + */ + if (preemptible()) + return false; + if (!kprobe_running()) + return false; + return kprobe_fault_handler(regs, trap); +} + #endif /* _LINUX_KPROBES_H */ -- cgit v1.2.3-55-g7522 From 175967318c3018d01931ac950c82adab5deb47ca Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 16 Jul 2019 16:30:47 -0700 Subject: mm: introduce ARCH_HAS_PTE_DEVMAP ARCH_HAS_ZONE_DEVICE is somewhat meaningless in itself, and combined with the long-out-of-date comment can lead to the impression than an architecture may just enable it (since __add_pages() now "comprehends device memory" for itself) and expect things to work. In practice, however, ZONE_DEVICE users have little chance of functioning correctly without __HAVE_ARCH_PTE_DEVMAP, so let's clean that up the same way as ARCH_HAS_PTE_SPECIAL and make it the proper dependency so the real situation is clearer. Link: http://lkml.kernel.org/r/87554aa78478a02a63f2c4cf60a847279ae3eb3b.1558547956.git.robin.murphy@arm.com Signed-off-by: Robin Murphy Acked-by: Dan Williams Reviewed-by: Ira Weiny Acked-by: Oliver O'Halloran Reviewed-by: Anshuman Khandual Cc: Michael Ellerman Cc: Catalin Marinas Cc: David Hildenbrand Cc: Jerome Glisse Cc: Michal Hocko Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/Kconfig | 2 +- arch/powerpc/include/asm/book3s/64/pgtable.h | 1 - arch/x86/Kconfig | 2 +- arch/x86/include/asm/pgtable.h | 4 ++-- arch/x86/include/asm/pgtable_types.h | 1 - include/linux/mm.h | 4 ++-- include/linux/pfn_t.h | 4 ++-- mm/Kconfig | 5 ++--- mm/gup.c | 2 +- 9 files changed, 11 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index f516796dd819..d8dcd8820369 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -129,6 +129,7 @@ config PPC select ARCH_HAS_MMIOWB if PPC64 select ARCH_HAS_PHYS_TO_DMA select ARCH_HAS_PMEM_API if PPC64 + select ARCH_HAS_PTE_DEVMAP if PPC_BOOK3S_64 select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_MEMBARRIER_CALLBACKS select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE && PPC64 @@ -136,7 +137,6 @@ config PPC select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UACCESS_FLUSHCACHE if PPC64 select ARCH_HAS_UBSAN_SANITIZE_ALL - select ARCH_HAS_ZONE_DEVICE if PPC_BOOK3S_64 select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_KEEP_MEMBLOCK select ARCH_MIGHT_HAVE_PC_PARPORT diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 62e6ea0a7650..8308f32e9782 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -90,7 +90,6 @@ #define _PAGE_SOFT_DIRTY _RPAGE_SW3 /* software: software dirty tracking */ #define _PAGE_SPECIAL _RPAGE_SW2 /* software: special page */ #define _PAGE_DEVMAP _RPAGE_SW1 /* software: ZONE_DEVICE page */ -#define __HAVE_ARCH_PTE_DEVMAP /* * Drivers request for cache inhibited pte mapping using _PAGE_NO_CACHE diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 879741336771..4a55bd01e918 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -70,6 +70,7 @@ config X86 select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_PMEM_API if X86_64 + select ARCH_HAS_PTE_DEVMAP if X86_64 select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_REFCOUNT select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 @@ -80,7 +81,6 @@ config X86 select ARCH_HAS_STRICT_MODULE_RWX select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE select ARCH_HAS_UBSAN_SANITIZE_ALL - select ARCH_HAS_ZONE_DEVICE if X86_64 select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_MIGHT_HAVE_PC_PARPORT diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 5e0509b41986..0bc530c4eb13 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -271,7 +271,7 @@ static inline int has_transparent_hugepage(void) return boot_cpu_has(X86_FEATURE_PSE); } -#ifdef __HAVE_ARCH_PTE_DEVMAP +#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP static inline int pmd_devmap(pmd_t pmd) { return !!(pmd_val(pmd) & _PAGE_DEVMAP); @@ -732,7 +732,7 @@ static inline int pte_present(pte_t a) return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); } -#ifdef __HAVE_ARCH_PTE_DEVMAP +#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP static inline int pte_devmap(pte_t a) { return (pte_flags(a) & _PAGE_DEVMAP) == _PAGE_DEVMAP; diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index d6ff0bbdb394..b5e49e6bac63 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -103,7 +103,6 @@ #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) #define _PAGE_DEVMAP (_AT(u64, 1) << _PAGE_BIT_DEVMAP) -#define __HAVE_ARCH_PTE_DEVMAP #else #define _PAGE_NX (_AT(pteval_t, 0)) #define _PAGE_DEVMAP (_AT(pteval_t, 0)) diff --git a/include/linux/mm.h b/include/linux/mm.h index baa8b8761d8c..f43f4de4de68 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -547,7 +547,7 @@ static inline void vma_set_anonymous(struct vm_area_struct *vma) struct mmu_gather; struct inode; -#if !defined(__HAVE_ARCH_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE) +#if !defined(CONFIG_ARCH_HAS_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE) static inline int pmd_devmap(pmd_t pmd) { return 0; @@ -1750,7 +1750,7 @@ static inline void sync_mm_rss(struct mm_struct *mm) } #endif -#ifndef __HAVE_ARCH_PTE_DEVMAP +#ifndef CONFIG_ARCH_HAS_PTE_DEVMAP static inline int pte_devmap(pte_t pte) { return 0; diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index 01e8037023f7..2d9148221e9a 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -97,7 +97,7 @@ static inline pud_t pfn_t_pud(pfn_t pfn, pgprot_t pgprot) #endif #endif -#ifdef __HAVE_ARCH_PTE_DEVMAP +#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP static inline bool pfn_t_devmap(pfn_t pfn) { const u64 flags = PFN_DEV|PFN_MAP; @@ -115,7 +115,7 @@ pmd_t pmd_mkdevmap(pmd_t pmd); defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) pud_t pud_mkdevmap(pud_t pud); #endif -#endif /* __HAVE_ARCH_PTE_DEVMAP */ +#endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */ #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL static inline bool pfn_t_special(pfn_t pfn) diff --git a/mm/Kconfig b/mm/Kconfig index 495d7368ced8..56cec636a1fc 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -649,8 +649,7 @@ config IDLE_PAGE_TRACKING See Documentation/admin-guide/mm/idle_page_tracking.rst for more details. -# arch_add_memory() comprehends device memory -config ARCH_HAS_ZONE_DEVICE +config ARCH_HAS_PTE_DEVMAP bool config ZONE_DEVICE @@ -658,7 +657,7 @@ config ZONE_DEVICE depends on MEMORY_HOTPLUG depends on MEMORY_HOTREMOVE depends on SPARSEMEM_VMEMMAP - depends on ARCH_HAS_ZONE_DEVICE + depends on ARCH_HAS_PTE_DEVMAP select XARRAY_MULTI help diff --git a/mm/gup.c b/mm/gup.c index 8bbaa5523116..98f13ab37bac 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1895,7 +1895,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, } #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */ -#if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) +#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) static int __gup_device_huge(unsigned long pfn, unsigned long addr, unsigned long end, struct page **pages, int *nr) { -- cgit v1.2.3-55-g7522 From 090d54bcbc54af75e94442e60f42d973341a5f53 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 26 Jun 2019 16:57:09 +0800 Subject: Revert "x86/paravirt: Set up the virt_spin_lock_key after static keys get initialized" This reverts commit ca5d376e17072c1b60c3fee66f3be58ef018952d. Commit 8990cac6e5ea ("x86/jump_label: Initialize static branching early") adds jump_label_init() call in setup_arch() to make static keys initialized early, so we could use the original simpler code again. Signed-off-by: Zhenzhong Duan Reviewed-by: Thomas Gleixner Signed-off-by: Juergen Gross --- arch/x86/kernel/smpboot.c | 3 +-- arch/x86/xen/spinlock.c | 6 ++---- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 259d1d2be076..fdbd47ceb84d 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1368,8 +1368,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) pr_info("CPU0: "); print_cpu_info(&cpu_data(0)); - native_pv_lock_init(); - uv_system_init(); set_mtrr_aps_delayed_init(); @@ -1399,6 +1397,7 @@ void __init native_smp_prepare_boot_cpu(void) /* already set me in cpu_online_mask in boot_cpu_init() */ cpumask_set_cpu(me, cpu_callout_mask); cpu_set_state_online(me); + native_pv_lock_init(); } void __init calculate_max_logical_packages(void) diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 3776122c87cc..6deb49094c60 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -68,11 +68,8 @@ void xen_init_lock_cpu(int cpu) int irq; char *name; - if (!xen_pvspin) { - if (cpu == 0) - static_branch_disable(&virt_spin_lock_key); + if (!xen_pvspin) return; - } WARN(per_cpu(lock_kicker_irq, cpu) >= 0, "spinlock on CPU%d exists on IRQ%d!\n", cpu, per_cpu(lock_kicker_irq, cpu)); @@ -124,6 +121,7 @@ void __init xen_init_spinlocks(void) if (!xen_pvspin) { printk(KERN_DEBUG "xen: PV spinlocks disabled\n"); + static_branch_disable(&virt_spin_lock_key); return; } printk(KERN_DEBUG "xen: PV spinlocks enabled\n"); -- cgit v1.2.3-55-g7522 From 1b37683cda0217305837fd1b79e7c57104d4f983 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Thu, 11 Jul 2019 20:02:08 +0800 Subject: x86/xen: Mark xen_hvm_need_lapic() and xen_x2apic_para_available() as __init .. as they are only called at early bootup stage. In fact, other functions in x86_hyper_xen_hvm.init.* are all marked as __init. Unexport xen_hvm_need_lapic as it's never used outside. Signed-off-by: Zhenzhong Duan Reviewed-by: Juergen Gross Cc: Boris Ostrovsky Cc: Stefano Stabellini Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Signed-off-by: Juergen Gross --- arch/x86/include/asm/xen/hypervisor.h | 6 +++--- arch/x86/xen/enlighten_hvm.c | 3 +-- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 39171b3646bb..42e1245af0d8 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -44,14 +44,14 @@ static inline uint32_t xen_cpuid_base(void) } #ifdef CONFIG_XEN -extern bool xen_hvm_need_lapic(void); +extern bool __init xen_hvm_need_lapic(void); -static inline bool xen_x2apic_para_available(void) +static inline bool __init xen_x2apic_para_available(void) { return xen_hvm_need_lapic(); } #else -static inline bool xen_x2apic_para_available(void) +static inline bool __init xen_x2apic_para_available(void) { return (xen_cpuid_base() != 0); } diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 0e75642d42a3..ac4943cd456a 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -218,7 +218,7 @@ static __init int xen_parse_nopv(char *arg) } early_param("xen_nopv", xen_parse_nopv); -bool xen_hvm_need_lapic(void) +bool __init xen_hvm_need_lapic(void) { if (xen_nopv) return false; @@ -230,7 +230,6 @@ bool xen_hvm_need_lapic(void) return false; return true; } -EXPORT_SYMBOL_GPL(xen_hvm_need_lapic); static uint32_t __init xen_platform_hvm(void) { -- cgit v1.2.3-55-g7522 From 30978346372e5c43a652cfbd4533c6bd5427c33b Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Thu, 11 Jul 2019 20:02:09 +0800 Subject: x86: Add "nopv" parameter to disable PV extensions In virtualization environment, PV extensions (drivers, interrupts, timers, etc) are enabled in the majority of use cases which is the best option. However, in some cases (kexec not fully working, benchmarking) we want to disable PV extensions. We have "xen_nopv" for that purpose but only for XEN. For a consistent admin experience a common command line parameter "nopv" set across all PV guest implementations is a better choice. There are guest types which just won't work without PV extensions, like Xen PV, Xen PVH and jailhouse. add a "ignore_nopv" member to struct hypervisor_x86 set to true for those guest types and call the detect functions only if nopv is false or ignore_nopv is true. Suggested-by: Juergen Gross Signed-off-by: Zhenzhong Duan Reviewed-by: Juergen Gross Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Jan Kiszka Cc: Boris Ostrovsky Cc: Stefano Stabellini Signed-off-by: Juergen Gross --- Documentation/admin-guide/kernel-parameters.txt | 5 +++++ arch/x86/include/asm/hypervisor.h | 4 ++++ arch/x86/kernel/cpu/hypervisor.c | 11 +++++++++++ arch/x86/kernel/jailhouse.c | 1 + arch/x86/xen/enlighten_pv.c | 1 + 5 files changed, 22 insertions(+) (limited to 'arch/x86') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 99db4975e6b5..936e8e7e6474 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5257,6 +5257,11 @@ improve timer resolution at the expense of processing more timer interrupts. + nopv= [X86,XEN,KVM,HYPER_V,VMWARE] + Disables the PV optimizations forcing the guest to run + as generic guest with no PV drivers. Currently support + XEN HVM, KVM, HYPER_V and VMWARE guest. + xirc2ps_cs= [NET,PCMCIA] Format: ,,,,,[,[,[,]]] diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 50a30f6c668b..f7b4c5338428 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -53,8 +53,12 @@ struct hypervisor_x86 { /* runtime callbacks */ struct x86_hyper_runtime runtime; + + /* ignore nopv parameter */ + bool ignore_nopv; }; +extern bool nopv; extern enum x86_hypervisor_type x86_hyper_type; extern void init_hypervisor_platform(void); static inline bool hypervisor_is_type(enum x86_hypervisor_type type) diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 87e39ad8d873..7eaad41c68f4 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -58,6 +58,14 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = enum x86_hypervisor_type x86_hyper_type; EXPORT_SYMBOL(x86_hyper_type); +bool __initdata nopv; +static __init int parse_nopv(char *arg) +{ + nopv = true; + return 0; +} +early_param("nopv", parse_nopv); + static inline const struct hypervisor_x86 * __init detect_hypervisor_vendor(void) { @@ -65,6 +73,9 @@ detect_hypervisor_vendor(void) uint32_t pri, max_pri = 0; for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) { + if (unlikely(nopv) && !(*p)->ignore_nopv) + continue; + pri = (*p)->detect(); if (pri > max_pri) { max_pri = pri; diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index 6857b4577f17..3ad34f01de2a 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -217,4 +217,5 @@ const struct hypervisor_x86 x86_hyper_jailhouse __refconst = { .detect = jailhouse_detect, .init.init_platform = jailhouse_init_platform, .init.x2apic_available = jailhouse_x2apic_available, + .ignore_nopv = true, }; diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 4722ba2966ac..5d16824e4dca 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1463,4 +1463,5 @@ const __initconst struct hypervisor_x86 x86_hyper_xen_pv = { .detect = xen_platform_pv, .type = X86_HYPER_XEN_PV, .runtime.pin_vcpu = xen_pin_vcpu, + .ignore_nopv = true, }; -- cgit v1.2.3-55-g7522 From b39b049749ce08c7756be57082177730617bb9a0 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Thu, 11 Jul 2019 20:02:10 +0800 Subject: xen: Map "xen_nopv" parameter to "nopv" and mark it obsolete Clean up unnecessory code after that operation. Signed-off-by: Zhenzhong Duan Reviewed-by: Boris Ostrovsky Cc: Juergen Gross Cc: Stefano Stabellini Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Signed-off-by: Juergen Gross --- Documentation/admin-guide/kernel-parameters.txt | 2 ++ arch/x86/xen/enlighten_hvm.c | 12 ++++++------ 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 936e8e7e6474..2d990585402c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5243,6 +5243,8 @@ xen_nopv [X86] Disables the PV optimizations forcing the HVM guest to run as generic HVM guest with no PV drivers. + This option is obsoleted by the "nopv" option, which + has equivalent effect for XEN platform. xen_scrub_pages= [XEN] Boolean option to control scrubbing pages before giving them back diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index ac4943cd456a..1756cf775ef2 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -210,18 +210,18 @@ static void __init xen_hvm_guest_init(void) #endif } -static bool xen_nopv; static __init int xen_parse_nopv(char *arg) { - xen_nopv = true; - return 0; + pr_notice("\"xen_nopv\" is deprecated, please use \"nopv\" instead\n"); + + if (xen_cpuid_base()) + nopv = true; + return 0; } early_param("xen_nopv", xen_parse_nopv); bool __init xen_hvm_need_lapic(void) { - if (xen_nopv) - return false; if (xen_pv_domain()) return false; if (!xen_hvm_domain()) @@ -233,7 +233,7 @@ bool __init xen_hvm_need_lapic(void) static uint32_t __init xen_platform_hvm(void) { - if (xen_pv_domain() || xen_nopv) + if (xen_pv_domain()) return 0; return xen_cpuid_base(); -- cgit v1.2.3-55-g7522 From cc8f3b4dd2eb859bc57187ccd94b5cd715d9cfba Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Tue, 16 Jul 2019 12:26:08 +0800 Subject: x86/paravirt: Remove const mark from x86_hyper_xen_hvm variable .. as "nopv" support needs it to be changeable at boot up stage. Checkpatch reports warning, so move variable declarations from hypervisor.c to hypervisor.h Signed-off-by: Zhenzhong Duan Reviewed-by: Juergen Gross Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Stefano Stabellini Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Signed-off-by: Juergen Gross --- arch/x86/include/asm/hypervisor.h | 8 ++++++++ arch/x86/kernel/cpu/hypervisor.c | 8 -------- arch/x86/xen/enlighten_hvm.c | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index f7b4c5338428..e41cbf2ec41d 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -58,6 +58,14 @@ struct hypervisor_x86 { bool ignore_nopv; }; +extern const struct hypervisor_x86 x86_hyper_vmware; +extern const struct hypervisor_x86 x86_hyper_ms_hyperv; +extern const struct hypervisor_x86 x86_hyper_xen_pv; +extern const struct hypervisor_x86 x86_hyper_kvm; +extern const struct hypervisor_x86 x86_hyper_jailhouse; +extern const struct hypervisor_x86 x86_hyper_acrn; +extern struct hypervisor_x86 x86_hyper_xen_hvm; + extern bool nopv; extern enum x86_hypervisor_type x86_hyper_type; extern void init_hypervisor_platform(void); diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 7eaad41c68f4..553bfbfc3a1b 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -26,14 +26,6 @@ #include #include -extern const struct hypervisor_x86 x86_hyper_vmware; -extern const struct hypervisor_x86 x86_hyper_ms_hyperv; -extern const struct hypervisor_x86 x86_hyper_xen_pv; -extern const struct hypervisor_x86 x86_hyper_xen_hvm; -extern const struct hypervisor_x86 x86_hyper_kvm; -extern const struct hypervisor_x86 x86_hyper_jailhouse; -extern const struct hypervisor_x86 x86_hyper_acrn; - static const __initconst struct hypervisor_x86 * const hypervisors[] = { #ifdef CONFIG_XEN_PV diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 1756cf775ef2..b671983cb4f5 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -259,7 +259,7 @@ static __init void xen_hvm_guest_late_init(void) #endif } -const __initconst struct hypervisor_x86 x86_hyper_xen_hvm = { +struct hypervisor_x86 x86_hyper_xen_hvm __initdata = { .name = "Xen HVM", .detect = xen_platform_hvm, .type = X86_HYPER_XEN_HVM, -- cgit v1.2.3-55-g7522 From bef6e0ae7420bfddfb150dda529bbe835f87b9f2 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Tue, 16 Jul 2019 12:26:09 +0800 Subject: x86/xen: Add "nopv" support for HVM guest PVH guest needs PV extentions to work, so "nopv" parameter should be ignored for PVH but not for HVM guest. If PVH guest boots up via the Xen-PVH boot entry, xen_pvh is set early, we know it's PVH guest and ignore "nopv" parameter directly. If PVH guest boots up via the normal boot entry same as HVM guest, it's hard to distinguish PVH and HVM guest at that time. In this case, we have to panic early if PVH is detected and nopv is enabled to avoid a worse situation later. Remove static from bool_x86_init_noop/x86_op_int_noop so they could be used globally. Move xen_platform_hvm() after xen_hvm_guest_late_init() to avoid compile error. Signed-off-by: Zhenzhong Duan Reviewed-by: Boris Ostrovsky Cc: Juergen Gross Cc: Stefano Stabellini Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Signed-off-by: Juergen Gross --- arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/x86_init.c | 4 ++-- arch/x86/xen/enlighten_hvm.c | 43 +++++++++++++++++++++++++++++++++-------- 3 files changed, 39 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index b85a7c54c6a1..ac0934189017 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -301,6 +301,8 @@ extern struct x86_apic_ops x86_apic_ops; extern void x86_early_init_platform_quirks(void); extern void x86_init_noop(void); extern void x86_init_uint_noop(unsigned int unused); +extern bool bool_x86_init_noop(void); +extern void x86_op_int_noop(int cpu); extern bool x86_pnpbios_disabled(void); #endif diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 50a2b492fdd6..1bef687faf22 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -29,8 +29,8 @@ void x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } static int __init iommu_init_noop(void) { return 0; } static void iommu_shutdown_noop(void) { } -static bool __init bool_x86_init_noop(void) { return false; } -static void x86_op_int_noop(int cpu) { } +bool __init bool_x86_init_noop(void) { return false; } +void x86_op_int_noop(int cpu) { } /* * The platform setup functions are preset with the default functions diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index b671983cb4f5..e138f7de52d2 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -231,14 +231,6 @@ bool __init xen_hvm_need_lapic(void) return true; } -static uint32_t __init xen_platform_hvm(void) -{ - if (xen_pv_domain()) - return 0; - - return xen_cpuid_base(); -} - static __init void xen_hvm_guest_late_init(void) { #ifdef CONFIG_XEN_PVH @@ -250,6 +242,9 @@ static __init void xen_hvm_guest_late_init(void) /* PVH detected. */ xen_pvh = true; + if (nopv) + panic("\"nopv\" and \"xen_nopv\" parameters are unsupported in PVH guest."); + /* Make sure we don't fall back to (default) ACPI_IRQ_MODEL_PIC. */ if (!nr_ioapics && acpi_irq_model == ACPI_IRQ_MODEL_PIC) acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM; @@ -259,6 +254,37 @@ static __init void xen_hvm_guest_late_init(void) #endif } +static uint32_t __init xen_platform_hvm(void) +{ + uint32_t xen_domain = xen_cpuid_base(); + struct x86_hyper_init *h = &x86_hyper_xen_hvm.init; + + if (xen_pv_domain()) + return 0; + + if (xen_pvh_domain() && nopv) { + /* Guest booting via the Xen-PVH boot entry goes here */ + pr_info("\"nopv\" parameter is ignored in PVH guest\n"); + nopv = false; + } else if (nopv && xen_domain) { + /* + * Guest booting via normal boot entry (like via grub2) goes + * here. + * + * Use interface functions for bare hardware if nopv, + * xen_hvm_guest_late_init is an exception as we need to + * detect PVH and panic there. + */ + h->init_platform = x86_init_noop; + h->x2apic_available = bool_x86_init_noop; + h->init_mem_mapping = x86_init_noop; + h->init_after_bootmem = x86_init_noop; + h->guest_late_init = xen_hvm_guest_late_init; + x86_hyper_xen_hvm.runtime.pin_vcpu = x86_op_int_noop; + } + return xen_domain; +} + struct hypervisor_x86 x86_hyper_xen_hvm __initdata = { .name = "Xen HVM", .detect = xen_platform_hvm, @@ -268,4 +294,5 @@ struct hypervisor_x86 x86_hyper_xen_hvm __initdata = { .init.init_mem_mapping = xen_hvm_init_mem_mapping, .init.guest_late_init = xen_hvm_guest_late_init, .runtime.pin_vcpu = xen_pin_vcpu, + .ignore_nopv = true, }; -- cgit v1.2.3-55-g7522 From b23e5844dfe78a80ba672793187d3f52e4b528d7 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sun, 14 Jul 2019 17:15:32 +0800 Subject: xen/pv: Fix a boot up hang revealed by int3 self test Commit 7457c0da024b ("x86/alternatives: Add int3_emulate_call() selftest") is used to ensure there is a gap setup in int3 exception stack which could be used for inserting call return address. This gap is missed in XEN PV int3 exception entry path, then below panic triggered: [ 0.772876] general protection fault: 0000 [#1] SMP NOPTI [ 0.772886] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.2.0+ #11 [ 0.772893] RIP: e030:int3_magic+0x0/0x7 [ 0.772905] RSP: 3507:ffffffff82203e98 EFLAGS: 00000246 [ 0.773334] Call Trace: [ 0.773334] alternative_instructions+0x3d/0x12e [ 0.773334] check_bugs+0x7c9/0x887 [ 0.773334] ? __get_locked_pte+0x178/0x1f0 [ 0.773334] start_kernel+0x4ff/0x535 [ 0.773334] ? set_init_arg+0x55/0x55 [ 0.773334] xen_start_kernel+0x571/0x57a For 64bit PV guests, Xen's ABI enters the kernel with using SYSRET, with %rcx/%r11 on the stack. To convert back to "normal" looking exceptions, the xen thunks do 'xen_*: pop %rcx; pop %r11; jmp *'. E.g. Extracting 'xen_pv_trap xenint3' we have: xen_xenint3: pop %rcx; pop %r11; jmp xenint3 As xenint3 and int3 entry code are same except xenint3 doesn't generate a gap, we can fix it by using int3 and drop useless xenint3. Signed-off-by: Zhenzhong Duan Reviewed-by: Juergen Gross Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Stefano Stabellini Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Andrew Cooper Signed-off-by: Juergen Gross --- arch/x86/entry/entry_64.S | 1 - arch/x86/include/asm/traps.h | 2 +- arch/x86/xen/enlighten_pv.c | 2 +- arch/x86/xen/xen-asm_64.S | 1 - 4 files changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 0ea4831a72a4..35a66fcfcb91 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1176,7 +1176,6 @@ idtentry stack_segment do_stack_segment has_error_code=1 #ifdef CONFIG_XEN_PV idtentry xennmi do_nmi has_error_code=0 idtentry xendebug do_debug has_error_code=0 -idtentry xenint3 do_int3 has_error_code=0 #endif idtentry general_protection do_general_protection has_error_code=1 diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 7d6f3f3fad78..f2bd284abc16 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -40,7 +40,7 @@ asmlinkage void simd_coprocessor_error(void); asmlinkage void xen_divide_error(void); asmlinkage void xen_xennmi(void); asmlinkage void xen_xendebug(void); -asmlinkage void xen_xenint3(void); +asmlinkage void xen_int3(void); asmlinkage void xen_overflow(void); asmlinkage void xen_bounds(void); asmlinkage void xen_invalid_op(void); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 5d16824e4dca..bed6bb93c965 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -596,12 +596,12 @@ struct trap_array_entry { static struct trap_array_entry trap_array[] = { { debug, xen_xendebug, true }, - { int3, xen_xenint3, true }, { double_fault, xen_double_fault, true }, #ifdef CONFIG_X86_MCE { machine_check, xen_machine_check, true }, #endif { nmi, xen_xennmi, true }, + { int3, xen_int3, false }, { overflow, xen_overflow, false }, #ifdef CONFIG_IA32_EMULATION { entry_INT80_compat, xen_entry_INT80_compat, false }, diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 1e9ef0ba30a5..ebf610b49c06 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -32,7 +32,6 @@ xen_pv_trap divide_error xen_pv_trap debug xen_pv_trap xendebug xen_pv_trap int3 -xen_pv_trap xenint3 xen_pv_trap xennmi xen_pv_trap overflow xen_pv_trap bounds -- cgit v1.2.3-55-g7522 From 89ff7131f78ad083665382146e66430d66399076 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 13 Jul 2019 13:01:10 +0900 Subject: kbuild: add --hash-style= and --build-id unconditionally As commit 1e0221374e30 ("mips: vdso: drop unnecessary cc-ldoption") explained, these flags are supported by the minimal required version of binutils. They are supported by ld.lld too. Signed-off-by: Masahiro Yamada Reviewed-by: Nathan Chancellor Tested-by: Nathan Chancellor --- Makefile | 6 ++---- arch/arm/vdso/Makefile | 3 +-- arch/arm64/kernel/vdso32/Makefile | 4 ++-- arch/sparc/vdso/Makefile | 3 +-- arch/x86/entry/vdso/Makefile | 5 ++--- 5 files changed, 8 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/Makefile b/Makefile index 2c5d00ba537e..969182105dbd 100644 --- a/Makefile +++ b/Makefile @@ -900,10 +900,8 @@ KBUILD_CPPFLAGS += $(ARCH_CPPFLAGS) $(KCPPFLAGS) KBUILD_AFLAGS += $(ARCH_AFLAGS) $(KAFLAGS) KBUILD_CFLAGS += $(ARCH_CFLAGS) $(KCFLAGS) -# Use --build-id when available. -LDFLAGS_BUILD_ID := $(call ld-option, --build-id) -KBUILD_LDFLAGS_MODULE += $(LDFLAGS_BUILD_ID) -LDFLAGS_vmlinux += $(LDFLAGS_BUILD_ID) +KBUILD_LDFLAGS_MODULE += --build-id +LDFLAGS_vmlinux += --build-id ifeq ($(CONFIG_STRIP_ASM_SYMS),y) LDFLAGS_vmlinux += $(call ld-option, -X,) diff --git a/arch/arm/vdso/Makefile b/arch/arm/vdso/Makefile index ca85df247775..87b7769214e0 100644 --- a/arch/arm/vdso/Makefile +++ b/arch/arm/vdso/Makefile @@ -13,8 +13,7 @@ ccflags-y += -DDISABLE_BRANCH_PROFILING ldflags-$(CONFIG_CPU_ENDIAN_BE8) := --be8 ldflags-y := -Bsymbolic --no-undefined -soname=linux-vdso.so.1 \ -z max-page-size=4096 -nostdlib -shared $(ldflags-y) \ - $(call ld-option, --hash-style=sysv) \ - $(call ld-option, --build-id) \ + --hash-style=sysv --build-id \ -T obj-$(CONFIG_VDSO) += vdso.o diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile index 288c14d30b45..60a4c6239712 100644 --- a/arch/arm64/kernel/vdso32/Makefile +++ b/arch/arm64/kernel/vdso32/Makefile @@ -96,8 +96,8 @@ VDSO_LDFLAGS := $(VDSO_CPPFLAGS) VDSO_LDFLAGS += -Wl,-Bsymbolic -Wl,--no-undefined -Wl,-soname=linux-vdso.so.1 VDSO_LDFLAGS += -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 VDSO_LDFLAGS += -nostdlib -shared -mfloat-abi=soft -VDSO_LDFLAGS += $(call cc32-ldoption,-Wl$(comma)--hash-style=sysv) -VDSO_LDFLAGS += $(call cc32-ldoption,-Wl$(comma)--build-id) +VDSO_LDFLAGS += -Wl,--hash-style=sysv +VDSO_LDFLAGS += -Wl,--build-id VDSO_LDFLAGS += $(call cc32-ldoption,-fuse-ld=bfd) diff --git a/arch/sparc/vdso/Makefile b/arch/sparc/vdso/Makefile index 5a9e4e1f9f81..324a23947585 100644 --- a/arch/sparc/vdso/Makefile +++ b/arch/sparc/vdso/Makefile @@ -115,8 +115,7 @@ quiet_cmd_vdso = VDSO $@ -T $(filter %.lds,$^) $(filter %.o,$^) && \ sh $(srctree)/$(src)/checkundef.sh '$(OBJDUMP)' '$@' -VDSO_LDFLAGS = -shared $(call ld-option, --hash-style=both) \ - $(call ld-option, --build-id) -Bsymbolic +VDSO_LDFLAGS = -shared --hash-style=both --build-id -Bsymbolic GCOV_PROFILE := n # diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 34773395139a..8df549138193 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -176,9 +176,8 @@ quiet_cmd_vdso = VDSO $@ -T $(filter %.lds,$^) $(filter %.o,$^) && \ sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' -VDSO_LDFLAGS = -shared $(call ld-option, --hash-style=both) \ - $(call ld-option, --build-id) $(call ld-option, --eh-frame-hdr) \ - -Bsymbolic +VDSO_LDFLAGS = -shared --hash-style=both --build-id \ + $(call ld-option, --eh-frame-hdr) -Bsymbolic GCOV_PROFILE := n quiet_cmd_vdso_and_check = VDSO $@ -- cgit v1.2.3-55-g7522 From 55aedddb6149ab71bec9f050846855113977b033 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jul 2019 13:40:55 +0200 Subject: x86/paravirt: Make read_cr2() CALLEE_SAVE The one paravirt read_cr2() implementation (Xen) is actually quite trivial and doesn't need to clobber anything other than the return register. Making read_cr2() CALLEE_SAVE avoids all the PUSH/POP nonsense and allows more convenient use from assembly. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Juergen Gross Cc: bp@alien8.de Cc: rostedt@goodmis.org Cc: luto@kernel.org Cc: torvalds@linux-foundation.org Cc: hpa@zytor.com Cc: dave.hansen@linux.intel.com Cc: zhe.he@windriver.com Cc: joel@joelfernandes.org Cc: devel@etsukata.com Link: https://lkml.kernel.org/r/20190711114335.887392493@infradead.org --- arch/x86/entry/calling.h | 6 ++++++ arch/x86/include/asm/paravirt.h | 22 +++++++++++++--------- arch/x86/include/asm/paravirt_types.h | 2 +- arch/x86/kernel/asm-offsets.c | 1 + arch/x86/kernel/head_64.S | 4 +--- arch/x86/kernel/paravirt.c | 2 +- arch/x86/xen/enlighten_pv.c | 3 ++- arch/x86/xen/mmu_pv.c | 12 +----------- arch/x86/xen/xen-asm.S | 16 ++++++++++++++++ arch/x86/xen/xen-ops.h | 3 +++ 10 files changed, 45 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 9f1f9e3b8230..830bd984182b 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -343,3 +343,9 @@ For 32-bit we have the following conventions - kernel is built with .Lafter_call_\@: #endif .endm + +#ifdef CONFIG_PARAVIRT_XXL +#define GET_CR2_INTO(reg) GET_CR2_INTO_AX ; _ASM_MOV %_ASM_AX, reg +#else +#define GET_CR2_INTO(reg) _ASM_MOV %cr2, reg +#endif diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index c25c38a05c1c..5135282683d4 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -116,7 +116,7 @@ static inline void write_cr0(unsigned long x) static inline unsigned long read_cr2(void) { - return PVOP_CALL0(unsigned long, mmu.read_cr2); + return PVOP_CALLEE0(unsigned long, mmu.read_cr2); } static inline void write_cr2(unsigned long x) @@ -909,13 +909,7 @@ extern void default_banner(void); ANNOTATE_RETPOLINE_SAFE; \ call PARA_INDIRECT(pv_ops+PV_CPU_swapgs); \ ) -#endif - -#define GET_CR2_INTO_RAX \ - ANNOTATE_RETPOLINE_SAFE; \ - call PARA_INDIRECT(pv_ops+PV_MMU_read_cr2); -#ifdef CONFIG_PARAVIRT_XXL #define USERGS_SYSRET64 \ PARA_SITE(PARA_PATCH(PV_CPU_usergs_sysret64), \ ANNOTATE_RETPOLINE_SAFE; \ @@ -929,9 +923,19 @@ extern void default_banner(void); call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl); \ PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) #endif -#endif +#endif /* CONFIG_PARAVIRT_XXL */ +#endif /* CONFIG_X86_64 */ + +#ifdef CONFIG_PARAVIRT_XXL + +#define GET_CR2_INTO_AX \ + PARA_SITE(PARA_PATCH(PV_MMU_read_cr2), \ + ANNOTATE_RETPOLINE_SAFE; \ + call PARA_INDIRECT(pv_ops+PV_MMU_read_cr2); \ + ) + +#endif /* CONFIG_PARAVIRT_XXL */ -#endif /* CONFIG_X86_32 */ #endif /* __ASSEMBLY__ */ #else /* CONFIG_PARAVIRT */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 946f8f1f1efc..639b2df445ee 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -220,7 +220,7 @@ struct pv_mmu_ops { void (*exit_mmap)(struct mm_struct *mm); #ifdef CONFIG_PARAVIRT_XXL - unsigned long (*read_cr2)(void); + struct paravirt_callee_save read_cr2; void (*write_cr2)(unsigned long); unsigned long (*read_cr3)(void); diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index da64452584b0..5c7ee3df4d0b 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -76,6 +76,7 @@ static void __used common(void) BLANK(); OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); + OFFSET(XEN_vcpu_info_arch_cr2, vcpu_info, arch.cr2); #endif BLANK(); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index bcd206c8ac90..0e2d72929a8c 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -29,9 +29,7 @@ #ifdef CONFIG_PARAVIRT_XXL #include #include -#define GET_CR2_INTO(reg) GET_CR2_INTO_RAX ; movq %rax, reg #else -#define GET_CR2_INTO(reg) movq %cr2, reg #define INTERRUPT_RETURN iretq #endif @@ -323,7 +321,7 @@ early_idt_handler_common: cmpq $14,%rsi /* Page fault? */ jnz 10f - GET_CR2_INTO(%rdi) /* Can clobber any volatile register if pv */ + GET_CR2_INTO(%rdi) /* can clobber %rax if pv */ call early_make_pgtable andl %eax,%eax jz 20f /* All good */ diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 98039d7fb998..0aa6256eedd8 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -370,7 +370,7 @@ struct paravirt_patch_template pv_ops = { .mmu.exit_mmap = paravirt_nop, #ifdef CONFIG_PARAVIRT_XXL - .mmu.read_cr2 = native_read_cr2, + .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(native_read_cr2), .mmu.write_cr2 = native_write_cr2, .mmu.read_cr3 = __native_read_cr3, .mmu.write_cr3 = native_write_cr3, diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 4722ba2966ac..26b63d051bda 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -998,7 +998,8 @@ void __init xen_setup_vcpu_info_placement(void) __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); pv_ops.irq.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct); - pv_ops.mmu.read_cr2 = xen_read_cr2_direct; + pv_ops.mmu.read_cr2 = + __PV_IS_CALLEE_SAVE(xen_read_cr2_direct); } } diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index f6e5eeecfc69..26e8b326966d 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1307,16 +1307,6 @@ static void xen_write_cr2(unsigned long cr2) this_cpu_read(xen_vcpu)->arch.cr2 = cr2; } -static unsigned long xen_read_cr2(void) -{ - return this_cpu_read(xen_vcpu)->arch.cr2; -} - -unsigned long xen_read_cr2_direct(void) -{ - return this_cpu_read(xen_vcpu_info.arch.cr2); -} - static noinline void xen_flush_tlb(void) { struct mmuext_op *op; @@ -2397,7 +2387,7 @@ static void xen_leave_lazy_mmu(void) } static const struct pv_mmu_ops xen_mmu_ops __initconst = { - .read_cr2 = xen_read_cr2, + .read_cr2 = __PV_IS_CALLEE_SAVE(xen_read_cr2), .write_cr2 = xen_write_cr2, .read_cr3 = xen_read_cr3, diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 8019edd0125c..be104eef80be 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -10,6 +10,7 @@ #include #include #include +#include #include @@ -135,3 +136,18 @@ ENTRY(check_events) FRAME_END ret ENDPROC(check_events) + +ENTRY(xen_read_cr2) + FRAME_BEGIN + _ASM_MOV PER_CPU_VAR(xen_vcpu), %_ASM_AX + _ASM_MOV XEN_vcpu_info_arch_cr2(%_ASM_AX), %_ASM_AX + FRAME_END + ret + ENDPROC(xen_read_cr2); + +ENTRY(xen_read_cr2_direct) + FRAME_BEGIN + _ASM_MOV PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_arch_cr2, %_ASM_AX + FRAME_END + ret + ENDPROC(xen_read_cr2_direct); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 2f111f47ba98..45a441c33d6d 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -134,6 +134,9 @@ __visible void xen_irq_disable_direct(void); __visible unsigned long xen_save_fl_direct(void); __visible void xen_restore_fl_direct(unsigned long); +__visible unsigned long xen_read_cr2(void); +__visible unsigned long xen_read_cr2_direct(void); + /* These are not functions, and cannot be called normally */ __visible void xen_iret(void); __visible void xen_sysret32(void); -- cgit v1.2.3-55-g7522 From e67f1c11e5ea7fa47449a16325ecc997dbbf9bdf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jul 2019 13:40:56 +0200 Subject: x86/entry/32: Simplify common_exception Adding one more option to SAVE_ALL can be used in common_exception to simplify things. This also saves duplication later where page_fault will no longer use common_exception. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Andy Lutomirski Cc: bp@alien8.de Cc: torvalds@linux-foundation.org Cc: hpa@zytor.com Cc: dave.hansen@linux.intel.com Cc: jgross@suse.com Cc: zhe.he@windriver.com Cc: joel@joelfernandes.org Cc: devel@etsukata.com Link: https://lkml.kernel.org/r/20190711114335.945136187@infradead.org --- arch/x86/entry/entry_32.S | 36 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 90b473297299..4d4b6100f0e8 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -294,9 +294,11 @@ .Lfinished_frame_\@: .endm -.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 +.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0 cld +.if \skip_gs == 0 PUSH_GS +.endif FIXUP_FRAME pushl %fs pushl %es @@ -313,13 +315,13 @@ movl %edx, %es movl $(__KERNEL_PERCPU), %edx movl %edx, %fs +.if \skip_gs == 0 SET_KERNEL_GS %edx - +.endif /* Switch to kernel stack if necessary */ .if \switch_stacks > 0 SWITCH_TO_KERNEL_STACK .endif - .endm .macro SAVE_ALL_NMI cr3_reg:req @@ -1448,32 +1450,20 @@ END(page_fault) common_exception: /* the function address is in %gs's slot on the stack */ - FIXUP_FRAME - pushl %fs - pushl %es - pushl %ds - pushl %eax - movl $(__USER_DS), %eax - movl %eax, %ds - movl %eax, %es - movl $(__KERNEL_PERCPU), %eax - movl %eax, %fs - pushl %ebp - pushl %edi - pushl %esi - pushl %edx - pushl %ecx - pushl %ebx - SWITCH_TO_KERNEL_STACK + SAVE_ALL switch_stacks=1 skip_gs=1 ENCODE_FRAME_POINTER - cld UNWIND_ESPFIX_STACK + + /* fixup %gs */ GS_TO_REG %ecx movl PT_GS(%esp), %edi # get the function address - movl PT_ORIG_EAX(%esp), %edx # get the error code - movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart REG_TO_PTGS %ecx SET_KERNEL_GS %ecx + + /* fixup orig %eax */ + movl PT_ORIG_EAX(%esp), %edx # get the error code + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart + TRACE_IRQS_OFF movl %esp, %eax # pt_regs pointer CALL_NOSPEC %edi -- cgit v1.2.3-55-g7522 From 2fd37912cfb019228bf246215938e6f7619516a2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jul 2019 13:40:57 +0200 Subject: x86/entry/64: Simplify idtentry a little There's a bunch of duplication in idtentry, namely the .Lfrom_usermode_switch_stack is a paranoid=0 copy of the normal flow. Make this explicit by creating a idtentry_part helper macro. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt (VMware) Acked-by: Andy Lutomirski Cc: bp@alien8.de Cc: torvalds@linux-foundation.org Cc: hpa@zytor.com Cc: dave.hansen@linux.intel.com Cc: jgross@suse.com Cc: zhe.he@windriver.com Cc: joel@joelfernandes.org Cc: devel@etsukata.com Link: https://lkml.kernel.org/r/20190711114336.002429503@infradead.org --- arch/x86/entry/entry_64.S | 102 ++++++++++++++++++++++------------------------ 1 file changed, 48 insertions(+), 54 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 0ea4831a72a4..3db5fede743b 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -864,6 +864,52 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt */ #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8) +.macro idtentry_part do_sym, has_error_code:req, paranoid:req, shift_ist=-1, ist_offset=0 + + .if \paranoid + call paranoid_entry + /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ + .else + call error_entry + .endif + UNWIND_HINT_REGS + + .if \paranoid + .if \shift_ist != -1 + TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ + .else + TRACE_IRQS_OFF + .endif + .endif + + movq %rsp, %rdi /* pt_regs pointer */ + + .if \has_error_code + movq ORIG_RAX(%rsp), %rsi /* get error code */ + movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ + .else + xorl %esi, %esi /* no error code */ + .endif + + .if \shift_ist != -1 + subq $\ist_offset, CPU_TSS_IST(\shift_ist) + .endif + + call \do_sym + + .if \shift_ist != -1 + addq $\ist_offset, CPU_TSS_IST(\shift_ist) + .endif + + .if \paranoid + /* this procedure expect "no swapgs" flag in ebx */ + jmp paranoid_exit + .else + jmp error_exit + .endif + +.endm + /** * idtentry - Generate an IDT entry stub * @sym: Name of the generated entry point @@ -934,47 +980,7 @@ ENTRY(\sym) .Lfrom_usermode_no_gap_\@: .endif - .if \paranoid - call paranoid_entry - .else - call error_entry - .endif - UNWIND_HINT_REGS - /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ - - .if \paranoid - .if \shift_ist != -1 - TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ - .else - TRACE_IRQS_OFF - .endif - .endif - - movq %rsp, %rdi /* pt_regs pointer */ - - .if \has_error_code - movq ORIG_RAX(%rsp), %rsi /* get error code */ - movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ - .else - xorl %esi, %esi /* no error code */ - .endif - - .if \shift_ist != -1 - subq $\ist_offset, CPU_TSS_IST(\shift_ist) - .endif - - call \do_sym - - .if \shift_ist != -1 - addq $\ist_offset, CPU_TSS_IST(\shift_ist) - .endif - - /* these procedures expect "no swapgs" flag in ebx */ - .if \paranoid - jmp paranoid_exit - .else - jmp error_exit - .endif + idtentry_part \do_sym, \has_error_code, \paranoid, \shift_ist, \ist_offset .if \paranoid == 1 /* @@ -983,21 +989,9 @@ ENTRY(\sym) * run in real process context if user_mode(regs). */ .Lfrom_usermode_switch_stack_\@: - call error_entry - - movq %rsp, %rdi /* pt_regs pointer */ - - .if \has_error_code - movq ORIG_RAX(%rsp), %rsi /* get error code */ - movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ - .else - xorl %esi, %esi /* no error code */ + idtentry_part \do_sym, \has_error_code, paranoid=0 .endif - call \do_sym - - jmp error_exit - .endif _ASM_NOKPROBE(\sym) END(\sym) .endm -- cgit v1.2.3-55-g7522 From 4234653e882740cbf6625eeee294e388b3176583 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jul 2019 13:40:58 +0200 Subject: x86/entry/64: Update comments and sanity tests for create_gap Commit 2700fefdb2d9 ("x86_64: Add gap to int3 to allow for call emulation") forgot to update the comment, do so now. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt (VMware) Acked-by: Andy Lutomirski Cc: bp@alien8.de Cc: torvalds@linux-foundation.org Cc: hpa@zytor.com Cc: dave.hansen@linux.intel.com Cc: jgross@suse.com Cc: zhe.he@windriver.com Cc: joel@joelfernandes.org Cc: devel@etsukata.com Link: https://lkml.kernel.org/r/20190711114336.059780563@infradead.org --- arch/x86/entry/entry_64.S | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 3db5fede743b..95ae05f0edf2 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -913,15 +913,16 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt /** * idtentry - Generate an IDT entry stub * @sym: Name of the generated entry point - * @do_sym: C function to be called - * @has_error_code: True if this IDT vector has an error code on the stack - * @paranoid: non-zero means that this vector may be invoked from + * @do_sym: C function to be called + * @has_error_code: True if this IDT vector has an error code on the stack + * @paranoid: non-zero means that this vector may be invoked from * kernel mode with user GSBASE and/or user CR3. * 2 is special -- see below. * @shift_ist: Set to an IST index if entries from kernel mode should - * decrement the IST stack so that nested entries get a + * decrement the IST stack so that nested entries get a * fresh stack. (This is for #DB, which has a nasty habit - * of recursing.) + * of recursing.) + * @create_gap: create a 6-word stack gap when coming from kernel mode. * * idtentry generates an IDT stub that sets up a usable kernel context, * creates struct pt_regs, and calls @do_sym. The stub has the following @@ -951,10 +952,14 @@ ENTRY(\sym) UNWIND_HINT_IRET_REGS offset=\has_error_code*8 /* Sanity check */ - .if \shift_ist != -1 && \paranoid == 0 + .if \shift_ist != -1 && \paranoid != 1 .error "using shift_ist requires paranoid=1" .endif + .if \create_gap && \paranoid + .error "using create_gap requires paranoid=0" + .endif + ASM_CLAC .if \has_error_code == 0 -- cgit v1.2.3-55-g7522 From a0d14b8909de55139b8702fe0c7e80b69763dcfb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jul 2019 13:40:59 +0200 Subject: x86/mm, tracing: Fix CR2 corruption Despite the current efforts to read CR2 before tracing happens there still exist a number of possible holes: idtentry page_fault do_page_fault has_error_code=1 call error_entry TRACE_IRQS_OFF call trace_hardirqs_off* #PF // modifies CR2 CALL_enter_from_user_mode __context_tracking_exit() trace_user_exit(0) #PF // modifies CR2 call do_page_fault address = read_cr2(); /* whoopsie */ And similar for i386. Fix it by pulling the CR2 read into the entry code, before any of that stuff gets a chance to run and ruin things. Reported-by: He Zhe Reported-by: Eiichi Tsukata Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Cc: bp@alien8.de Cc: rostedt@goodmis.org Cc: torvalds@linux-foundation.org Cc: hpa@zytor.com Cc: dave.hansen@linux.intel.com Cc: jgross@suse.com Cc: joel@joelfernandes.org Link: https://lkml.kernel.org/r/20190711114336.116812491@infradead.org Debugged-by: Steven Rostedt --- arch/x86/entry/entry_32.S | 25 ++++++++++++++++++++++--- arch/x86/entry/entry_64.S | 35 ++++++++++++++++++----------------- arch/x86/include/asm/kvm_para.h | 2 +- arch/x86/include/asm/traps.h | 4 ++-- arch/x86/kernel/kvm.c | 8 ++++---- arch/x86/kernel/traps.c | 6 +----- arch/x86/mm/fault.c | 30 +++++++++++------------------- 7 files changed, 59 insertions(+), 51 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 4d4b6100f0e8..2bb986f305ac 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1443,9 +1443,28 @@ BUILD_INTERRUPT3(hv_stimer0_callback_vector, HYPERV_STIMER0_VECTOR, ENTRY(page_fault) ASM_CLAC - pushl $do_page_fault - ALIGN - jmp common_exception + pushl $0; /* %gs's slot on the stack */ + + SAVE_ALL switch_stacks=1 skip_gs=1 + + ENCODE_FRAME_POINTER + UNWIND_ESPFIX_STACK + + /* fixup %gs */ + GS_TO_REG %ecx + REG_TO_PTGS %ecx + SET_KERNEL_GS %ecx + + GET_CR2_INTO(%ecx) # might clobber %eax + + /* fixup orig %eax */ + movl PT_ORIG_EAX(%esp), %edx # get the error code + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart + + TRACE_IRQS_OFF + movl %esp, %eax # pt_regs pointer + call do_page_fault + jmp ret_from_exception END(page_fault) common_exception: diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 95ae05f0edf2..7cb2e1f1ec09 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -864,7 +864,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt */ #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8) -.macro idtentry_part do_sym, has_error_code:req, paranoid:req, shift_ist=-1, ist_offset=0 +.macro idtentry_part do_sym, has_error_code:req, read_cr2:req, paranoid:req, shift_ist=-1, ist_offset=0 .if \paranoid call paranoid_entry @@ -874,12 +874,21 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt .endif UNWIND_HINT_REGS - .if \paranoid + .if \read_cr2 + GET_CR2_INTO(%rdx); /* can clobber %rax */ + .endif + .if \shift_ist != -1 TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ .else TRACE_IRQS_OFF .endif + + .if \paranoid == 0 + testb $3, CS(%rsp) + jz .Lfrom_kernel_no_context_tracking_\@ + CALL_enter_from_user_mode +.Lfrom_kernel_no_context_tracking_\@: .endif movq %rsp, %rdi /* pt_regs pointer */ @@ -923,6 +932,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt * fresh stack. (This is for #DB, which has a nasty habit * of recursing.) * @create_gap: create a 6-word stack gap when coming from kernel mode. + * @read_cr2: load CR2 into the 3rd argument; done before calling any C code * * idtentry generates an IDT stub that sets up a usable kernel context, * creates struct pt_regs, and calls @do_sym. The stub has the following @@ -947,7 +957,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt * @paranoid == 2 is special: the stub will never switch stacks. This is for * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS. */ -.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0 +.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0 read_cr2=0 ENTRY(\sym) UNWIND_HINT_IRET_REGS offset=\has_error_code*8 @@ -985,7 +995,7 @@ ENTRY(\sym) .Lfrom_usermode_no_gap_\@: .endif - idtentry_part \do_sym, \has_error_code, \paranoid, \shift_ist, \ist_offset + idtentry_part \do_sym, \has_error_code, \read_cr2, \paranoid, \shift_ist, \ist_offset .if \paranoid == 1 /* @@ -994,7 +1004,7 @@ ENTRY(\sym) * run in real process context if user_mode(regs). */ .Lfrom_usermode_switch_stack_\@: - idtentry_part \do_sym, \has_error_code, paranoid=0 + idtentry_part \do_sym, \has_error_code, \read_cr2, paranoid=0 .endif _ASM_NOKPROBE(\sym) @@ -1006,7 +1016,7 @@ idtentry overflow do_overflow has_error_code=0 idtentry bounds do_bounds has_error_code=0 idtentry invalid_op do_invalid_op has_error_code=0 idtentry device_not_available do_device_not_available has_error_code=0 -idtentry double_fault do_double_fault has_error_code=1 paranoid=2 +idtentry double_fault do_double_fault has_error_code=1 paranoid=2 read_cr2=1 idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 idtentry invalid_TSS do_invalid_TSS has_error_code=1 idtentry segment_not_present do_segment_not_present has_error_code=1 @@ -1179,10 +1189,10 @@ idtentry xenint3 do_int3 has_error_code=0 #endif idtentry general_protection do_general_protection has_error_code=1 -idtentry page_fault do_page_fault has_error_code=1 +idtentry page_fault do_page_fault has_error_code=1 read_cr2=1 #ifdef CONFIG_KVM_GUEST -idtentry async_page_fault do_async_page_fault has_error_code=1 +idtentry async_page_fault do_async_page_fault has_error_code=1 read_cr2=1 #endif #ifdef CONFIG_X86_MCE @@ -1281,18 +1291,9 @@ ENTRY(error_entry) movq %rax, %rsp /* switch stack */ ENCODE_FRAME_POINTER pushq %r12 - - /* - * We need to tell lockdep that IRQs are off. We can't do this until - * we fix gsbase, and we should do it before enter_from_user_mode - * (which can take locks). - */ - TRACE_IRQS_OFF - CALL_enter_from_user_mode ret .Lerror_entry_done: - TRACE_IRQS_OFF ret /* diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 5ed3cf1c3934..9b4df6eaa11a 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -92,7 +92,7 @@ void kvm_async_pf_task_wait(u32 token, int interrupt_kernel); void kvm_async_pf_task_wake(u32 token); u32 kvm_read_and_reset_pf_reason(void); extern void kvm_disable_steal_time(void); -void do_async_page_fault(struct pt_regs *regs, unsigned long error_code); +void do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address); #ifdef CONFIG_PARAVIRT_SPINLOCKS void __init kvm_spinlock_init(void); diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 7d6f3f3fad78..5dd1674ddf4c 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -74,14 +74,14 @@ dotraplinkage void do_invalid_TSS(struct pt_regs *regs, long error_code); dotraplinkage void do_segment_not_present(struct pt_regs *regs, long error_code); dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code); #ifdef CONFIG_X86_64 -dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code); +dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long address); asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs); asmlinkage __visible notrace struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s); void __init trap_init(void); #endif dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code); -dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code); +dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address); dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code); dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code); dotraplinkage void do_alignment_check(struct pt_regs *regs, long error_code); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 82caf01b63dd..3231440d6253 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -242,23 +242,23 @@ EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason); NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason); dotraplinkage void -do_async_page_fault(struct pt_regs *regs, unsigned long error_code) +do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address) { enum ctx_state prev_state; switch (kvm_read_and_reset_pf_reason()) { default: - do_page_fault(regs, error_code); + do_page_fault(regs, error_code, address); break; case KVM_PV_REASON_PAGE_NOT_PRESENT: /* page is swapped out by the host. */ prev_state = exception_enter(); - kvm_async_pf_task_wait((u32)read_cr2(), !user_mode(regs)); + kvm_async_pf_task_wait((u32)address, !user_mode(regs)); exception_exit(prev_state); break; case KVM_PV_REASON_PAGE_READY: rcu_irq_enter(); - kvm_async_pf_task_wake((u32)read_cr2()); + kvm_async_pf_task_wake((u32)address); rcu_irq_exit(); break; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 87095a477154..4bb0f8447112 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -313,13 +313,10 @@ __visible void __noreturn handle_stack_overflow(const char *message, #ifdef CONFIG_X86_64 /* Runs on IST stack */ -dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) +dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2) { static const char str[] = "double fault"; struct task_struct *tsk = current; -#ifdef CONFIG_VMAP_STACK - unsigned long cr2; -#endif #ifdef CONFIG_X86_ESPFIX64 extern unsigned char native_irq_return_iret[]; @@ -415,7 +412,6 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) * stack even if the actual trigger for the double fault was * something else. */ - cr2 = read_cr2(); if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE) handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2); #endif diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 794f364cb882..0799cc79efd3 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1507,9 +1507,8 @@ good_area: NOKPROBE_SYMBOL(do_user_addr_fault); /* - * This routine handles page faults. It determines the address, - * and the problem, and then passes it off to one of the appropriate - * routines. + * Explicitly marked noinline such that the function tracer sees this as the + * page_fault entry point. */ static noinline void __do_page_fault(struct pt_regs *regs, unsigned long hw_error_code, @@ -1528,33 +1527,26 @@ __do_page_fault(struct pt_regs *regs, unsigned long hw_error_code, } NOKPROBE_SYMBOL(__do_page_fault); -static nokprobe_inline void -trace_page_fault_entries(unsigned long address, struct pt_regs *regs, - unsigned long error_code) +static __always_inline void +trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { + if (!trace_pagefault_enabled()) + return; + if (user_mode(regs)) trace_page_fault_user(address, regs, error_code); else trace_page_fault_kernel(address, regs, error_code); } -/* - * We must have this function blacklisted from kprobes, tagged with notrace - * and call read_cr2() before calling anything else. To avoid calling any - * kind of tracing machinery before we've observed the CR2 value. - * - * exception_{enter,exit}() contains all sorts of tracepoints. - */ -dotraplinkage void notrace -do_page_fault(struct pt_regs *regs, unsigned long error_code) +dotraplinkage void +do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address) { - unsigned long address = read_cr2(); /* Get the faulting address */ enum ctx_state prev_state; prev_state = exception_enter(); - if (trace_pagefault_enabled()) - trace_page_fault_entries(address, regs, error_code); - + trace_page_fault_entries(regs, error_code, address); __do_page_fault(regs, error_code, address); exception_exit(prev_state); } -- cgit v1.2.3-55-g7522 From 083db6764821996526970e42d09c1ab2f4155dd4 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Jul 2019 20:36:36 -0500 Subject: x86/paravirt: Fix callee-saved function ELF sizes The __raw_callee_save_*() functions have an ELF symbol size of zero, which confuses objtool and other tools. Fixes a bunch of warnings like the following: arch/x86/xen/mmu_pv.o: warning: objtool: __raw_callee_save_xen_pte_val() is missing an ELF size annotation arch/x86/xen/mmu_pv.o: warning: objtool: __raw_callee_save_xen_pgd_val() is missing an ELF size annotation arch/x86/xen/mmu_pv.o: warning: objtool: __raw_callee_save_xen_make_pte() is missing an ELF size annotation arch/x86/xen/mmu_pv.o: warning: objtool: __raw_callee_save_xen_make_pgd() is missing an ELF size annotation Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Reviewed-by: Juergen Gross Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/afa6d49bb07497ca62e4fc3b27a2d0cece545b4e.1563413318.git.jpoimboe@redhat.com --- arch/x86/include/asm/paravirt.h | 1 + arch/x86/kernel/kvm.c | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index c25c38a05c1c..d6f5ae2c79ab 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -746,6 +746,7 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu); PV_RESTORE_ALL_CALLER_REGS \ FRAME_END \ "ret;" \ + ".size " PV_THUNK_NAME(func) ", .-" PV_THUNK_NAME(func) ";" \ ".popsection") /* Get a reference to a callee-save function */ diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 82caf01b63dd..6661bd2f08a6 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -838,6 +838,7 @@ asm( "cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);" "setne %al;" "ret;" +".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;" ".popsection"); #endif -- cgit v1.2.3-55-g7522 From d99a6ce70ec6ed990b74bd4e34232fd830d20d27 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Jul 2019 20:36:37 -0500 Subject: x86/kvm: Fix fastop function ELF metadata Some of the fastop functions, e.g. em_setcc(), are actually just used as global labels which point to blocks of functions. The global labels are incorrectly annotated as functions. Also the functions themselves don't have size annotations. Fixes a bunch of warnings like the following: arch/x86/kvm/emulate.o: warning: objtool: seto() is missing an ELF size annotation arch/x86/kvm/emulate.o: warning: objtool: em_setcc() is missing an ELF size annotation arch/x86/kvm/emulate.o: warning: objtool: setno() is missing an ELF size annotation arch/x86/kvm/emulate.o: warning: objtool: setc() is missing an ELF size annotation Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Acked-by: Paolo Bonzini Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/c8cc9be60ebbceb3092aa5dd91916039a1f88275.1563413318.git.jpoimboe@redhat.com --- arch/x86/kvm/emulate.c | 44 +++++++++++++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8e409ad448f9..718f7d9afedc 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -312,29 +312,42 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt) static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); -#define FOP_FUNC(name) \ +#define __FOP_FUNC(name) \ ".align " __stringify(FASTOP_SIZE) " \n\t" \ ".type " name ", @function \n\t" \ name ":\n\t" -#define FOP_RET "ret \n\t" +#define FOP_FUNC(name) \ + __FOP_FUNC(#name) + +#define __FOP_RET(name) \ + "ret \n\t" \ + ".size " name ", .-" name "\n\t" + +#define FOP_RET(name) \ + __FOP_RET(#name) #define FOP_START(op) \ extern void em_##op(struct fastop *fake); \ asm(".pushsection .text, \"ax\" \n\t" \ ".global em_" #op " \n\t" \ - FOP_FUNC("em_" #op) + ".align " __stringify(FASTOP_SIZE) " \n\t" \ + "em_" #op ":\n\t" #define FOP_END \ ".popsection") +#define __FOPNOP(name) \ + __FOP_FUNC(name) \ + __FOP_RET(name) + #define FOPNOP() \ - FOP_FUNC(__stringify(__UNIQUE_ID(nop))) \ - FOP_RET + __FOPNOP(__stringify(__UNIQUE_ID(nop))) #define FOP1E(op, dst) \ - FOP_FUNC(#op "_" #dst) \ - "10: " #op " %" #dst " \n\t" FOP_RET + __FOP_FUNC(#op "_" #dst) \ + "10: " #op " %" #dst " \n\t" \ + __FOP_RET(#op "_" #dst) #define FOP1EEX(op, dst) \ FOP1E(op, dst) _ASM_EXTABLE(10b, kvm_fastop_exception) @@ -366,8 +379,9 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); FOP_END #define FOP2E(op, dst, src) \ - FOP_FUNC(#op "_" #dst "_" #src) \ - #op " %" #src ", %" #dst " \n\t" FOP_RET + __FOP_FUNC(#op "_" #dst "_" #src) \ + #op " %" #src ", %" #dst " \n\t" \ + __FOP_RET(#op "_" #dst "_" #src) #define FASTOP2(op) \ FOP_START(op) \ @@ -405,8 +419,9 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); FOP_END #define FOP3E(op, dst, src, src2) \ - FOP_FUNC(#op "_" #dst "_" #src "_" #src2) \ - #op " %" #src2 ", %" #src ", %" #dst " \n\t" FOP_RET + __FOP_FUNC(#op "_" #dst "_" #src "_" #src2) \ + #op " %" #src2 ", %" #src ", %" #dst " \n\t"\ + __FOP_RET(#op "_" #dst "_" #src "_" #src2) /* 3-operand, word-only, src2=cl */ #define FASTOP3WCL(op) \ @@ -423,7 +438,7 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); ".type " #op ", @function \n\t" \ #op ": \n\t" \ #op " %al \n\t" \ - FOP_RET + __FOP_RET(#op) asm(".pushsection .fixup, \"ax\"\n" ".global kvm_fastop_exception \n" @@ -449,7 +464,10 @@ FOP_SETCC(setle) FOP_SETCC(setnle) FOP_END; -FOP_START(salc) "pushf; sbb %al, %al; popf \n\t" FOP_RET +FOP_START(salc) +FOP_FUNC(salc) +"pushf; sbb %al, %al; popf \n\t" +FOP_RET(salc) FOP_END; /* -- cgit v1.2.3-55-g7522 From 19f2d8fa98644c7b78845b1d66abeae4e3d9dfa8 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Jul 2019 20:36:38 -0500 Subject: x86/kvm: Replace vmx_vmenter()'s call to kvm_spurious_fault() with UD2 Objtool reports the following: arch/x86/kvm/vmx/vmenter.o: warning: objtool: vmx_vmenter()+0x14: call without frame pointer save/setup But frame pointers are necessarily broken anyway, because __vmx_vcpu_run() clobbers RBP with the guest's value before calling vmx_vmenter(). So calling without a frame pointer doesn't make things any worse. Make objtool happy by changing the call to a UD2. Suggested-by: Paolo Bonzini Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Acked-by: Paolo Bonzini Link: https://lkml.kernel.org/r/9fc2216c9dc972f95bb65ce2966a682c6bda1cb0.1563413318.git.jpoimboe@redhat.com --- arch/x86/kvm/vmx/vmenter.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index d4cb1945b2e3..4010d519eb8c 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -54,9 +54,9 @@ ENTRY(vmx_vmenter) ret 3: cmpb $0, kvm_rebooting - jne 4f - call kvm_spurious_fault -4: ret + je 4f + ret +4: ud2 .pushsection .fixup, "ax" 5: jmp 3b -- cgit v1.2.3-55-g7522 From 3901336ed9887b075531bffaeef7742ba614058b Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Jul 2019 20:36:39 -0500 Subject: x86/kvm: Don't call kvm_spurious_fault() from .fixup After making a change to improve objtool's sibling call detection, it started showing the following warning: arch/x86/kvm/vmx/nested.o: warning: objtool: .fixup+0x15: sibling call from callable instruction with modified stack frame The problem is the ____kvm_handle_fault_on_reboot() macro. It does a fake call by pushing a fake RIP and doing a jump. That tricks the unwinder into printing the function which triggered the exception, rather than the .fixup code. Instead of the hack to make it look like the original function made the call, just change the macro so that the original function actually does make the call. This allows removal of the hack, and also makes objtool happy. I triggered a vmx instruction exception and verified that the stack trace is still sane: kernel BUG at arch/x86/kvm/x86.c:358! invalid opcode: 0000 [#1] SMP PTI CPU: 28 PID: 4096 Comm: qemu-kvm Not tainted 5.2.0+ #16 Hardware name: Lenovo THINKSYSTEM SD530 -[7X2106Z000]-/-[7X2106Z000]-, BIOS -[TEE113Z-1.00]- 07/17/2017 RIP: 0010:kvm_spurious_fault+0x5/0x10 Code: 00 00 00 00 00 8b 44 24 10 89 d2 45 89 c9 48 89 44 24 10 8b 44 24 08 48 89 44 24 08 e9 d4 40 22 00 0f 1f 40 00 0f 1f 44 00 00 <0f> 0b 66 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 41 55 49 89 fd 41 RSP: 0018:ffffbf91c683bd00 EFLAGS: 00010246 RAX: 000061f040000000 RBX: ffff9e159c77bba0 RCX: ffff9e15a5c87000 RDX: 0000000665c87000 RSI: ffff9e15a5c87000 RDI: ffff9e159c77bba0 RBP: 0000000000000000 R08: 0000000000000000 R09: ffff9e15a5c87000 R10: 0000000000000000 R11: fffff8f2d99721c0 R12: ffff9e159c77bba0 R13: ffffbf91c671d960 R14: ffff9e159c778000 R15: 0000000000000000 FS: 00007fa341cbe700(0000) GS:ffff9e15b7400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fdd38356804 CR3: 00000006759de003 CR4: 00000000007606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: loaded_vmcs_init+0x4f/0xe0 alloc_loaded_vmcs+0x38/0xd0 vmx_create_vcpu+0xf7/0x600 kvm_vm_ioctl+0x5e9/0x980 ? __switch_to_asm+0x40/0x70 ? __switch_to_asm+0x34/0x70 ? __switch_to_asm+0x40/0x70 ? __switch_to_asm+0x34/0x70 ? free_one_page+0x13f/0x4e0 do_vfs_ioctl+0xa4/0x630 ksys_ioctl+0x60/0x90 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x55/0x1c0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7fa349b1ee5b Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Acked-by: Paolo Bonzini Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/64a9b64d127e87b6920a97afde8e96ea76f6524e.1563413318.git.jpoimboe@redhat.com --- arch/x86/include/asm/kvm_host.h | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0cc5b611a113..8282b8d41209 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1496,25 +1496,29 @@ enum { #define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0) #define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) +asmlinkage void __noreturn kvm_spurious_fault(void); + /* * Hardware virtualization extension instructions may fault if a * reboot turns off virtualization while processes are running. - * Trap the fault and ignore the instruction if that happens. + * Usually after catching the fault we just panic; during reboot + * instead the instruction is ignored. */ -asmlinkage void kvm_spurious_fault(void); - -#define ____kvm_handle_fault_on_reboot(insn, cleanup_insn) \ - "666: " insn "\n\t" \ - "668: \n\t" \ - ".pushsection .fixup, \"ax\" \n" \ - "667: \n\t" \ - cleanup_insn "\n\t" \ - "cmpb $0, kvm_rebooting \n\t" \ - "jne 668b \n\t" \ - __ASM_SIZE(push) " $666b \n\t" \ - "jmp kvm_spurious_fault \n\t" \ - ".popsection \n\t" \ - _ASM_EXTABLE(666b, 667b) +#define ____kvm_handle_fault_on_reboot(insn, cleanup_insn) \ + "666: \n\t" \ + insn "\n\t" \ + "jmp 668f \n\t" \ + "667: \n\t" \ + "call kvm_spurious_fault \n\t" \ + "668: \n\t" \ + ".pushsection .fixup, \"ax\" \n\t" \ + "700: \n\t" \ + cleanup_insn "\n\t" \ + "cmpb $0, kvm_rebooting\n\t" \ + "je 667b \n\t" \ + "jmp 668b \n\t" \ + ".popsection \n\t" \ + _ASM_EXTABLE(666b, 700b) #define __kvm_handle_fault_on_reboot(insn) \ ____kvm_handle_fault_on_reboot(insn, "") -- cgit v1.2.3-55-g7522 From e6dd47394493061c605285a868fc72eae2e9c866 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Jul 2019 20:36:40 -0500 Subject: x86/entry: Fix thunk function ELF sizes Fix the following warnings: arch/x86/entry/thunk_64.o: warning: objtool: trace_hardirqs_on_thunk() is missing an ELF size annotation arch/x86/entry/thunk_64.o: warning: objtool: trace_hardirqs_off_thunk() is missing an ELF size annotation arch/x86/entry/thunk_64.o: warning: objtool: lockdep_sys_exit_thunk() is missing an ELF size annotation Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/89c97adc9f6cc44a0f5d03cde6d0357662938909.1563413318.git.jpoimboe@redhat.com --- arch/x86/entry/thunk_64.S | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S index cfdca8b42c70..cc20465b2867 100644 --- a/arch/x86/entry/thunk_64.S +++ b/arch/x86/entry/thunk_64.S @@ -12,9 +12,7 @@ /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ .macro THUNK name, func, put_ret_addr_in_rdi=0 - .globl \name - .type \name, @function -\name: + ENTRY(\name) pushq %rbp movq %rsp, %rbp @@ -35,6 +33,7 @@ call \func jmp .L_restore + ENDPROC(\name) _ASM_NOKPROBE(\name) .endm -- cgit v1.2.3-55-g7522 From 61a73f5cd1a5794626d216cc56e20a1b195c5d0c Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Jul 2019 20:36:41 -0500 Subject: x86/head/64: Annotate start_cpu0() as non-callable After an objtool improvement, it complains about the fact that start_cpu0() jumps to code which has an LRET instruction. arch/x86/kernel/head_64.o: warning: objtool: .head.text+0xe4: unsupported instruction in callable function Technically, start_cpu0() is callable, but it acts nothing like a callable function. Prevent objtool from treating it like one by removing its ELF function annotation. Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/6b1b4505fcb90571a55fa1b52d71fb458ca24454.1563413318.git.jpoimboe@redhat.com --- arch/x86/kernel/head_64.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index bcd206c8ac90..66b4a7757397 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -253,10 +253,10 @@ END(secondary_startup_64) * start_secondary() via .Ljump_to_C_code. */ ENTRY(start_cpu0) - movq initial_stack(%rip), %rsp UNWIND_HINT_EMPTY + movq initial_stack(%rip), %rsp jmp .Ljump_to_C_code -ENDPROC(start_cpu0) +END(start_cpu0) #endif /* Both SMP bootup and ACPI suspend change these variables */ -- cgit v1.2.3-55-g7522 From 3a6ab4bcc52263dd5b1d2fd2e4ce95a38c798b4d Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Jul 2019 20:36:42 -0500 Subject: x86/uaccess: Remove ELF function annotation from copy_user_handle_tail() After an objtool improvement, it's complaining about the CLAC in copy_user_handle_tail(): arch/x86/lib/copy_user_64.o: warning: objtool: .altinstr_replacement+0x12: redundant UACCESS disable arch/x86/lib/copy_user_64.o: warning: objtool: copy_user_handle_tail()+0x6: (alt) arch/x86/lib/copy_user_64.o: warning: objtool: copy_user_handle_tail()+0x2: (alt) arch/x86/lib/copy_user_64.o: warning: objtool: copy_user_handle_tail()+0x0: <=== (func) copy_user_handle_tail() is incorrectly marked as a callable function, so objtool is rightfully concerned about the CLAC with no corresponding STAC. Remove the ELF function annotation. The copy_user_handle_tail() code path is already verified by objtool because it's jumped to by other callable asm code (which does the corresponding STAC). Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/6b6e436774678b4b9873811ff023bd29935bee5b.1563413318.git.jpoimboe@redhat.com --- arch/x86/lib/copy_user_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 378a1f70ae7d..4fe1601dbc5d 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -239,7 +239,7 @@ copy_user_handle_tail: ret _ASM_EXTABLE_UA(1b, 2b) -ENDPROC(copy_user_handle_tail) +END(copy_user_handle_tail) /* * copy_user_nocache - Uncached memory copy with exception handling -- cgit v1.2.3-55-g7522 From 5e307a6bc7b600999742675dd182bcb8a6fe308e Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Jul 2019 20:36:43 -0500 Subject: x86/uaccess: Don't leak AC flag into fentry from mcsafe_handle_tail() After adding mcsafe_handle_tail() to the objtool uaccess safe list, objtool reports: arch/x86/lib/usercopy_64.o: warning: objtool: mcsafe_handle_tail()+0x0: call to __fentry__() with UACCESS enabled With SMAP, this function is called with AC=1, so it needs to be careful about which functions it calls. Disable the ftrace entry hook, which can potentially pull in a lot of extra code. Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/8e13d6f0da1c8a3f7603903da6cbf6d582bbfe10.1563413318.git.jpoimboe@redhat.com --- arch/x86/lib/usercopy_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index e0e006f1624e..fff28c6f73a2 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -60,7 +60,7 @@ EXPORT_SYMBOL(clear_user); * but reuse __memcpy_mcsafe in case a new read error is encountered. * clac() is handled in _copy_to_iter_mcsafe(). */ -__visible unsigned long +__visible notrace unsigned long mcsafe_handle_tail(char *to, char *from, unsigned len) { for (; len; --len, to++, from++) { -- cgit v1.2.3-55-g7522 From 82e844a6536d1a3c12a73e44712f4021d90a4b53 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Jul 2019 20:36:44 -0500 Subject: x86/uaccess: Remove redundant CLACs in getuser/putuser error paths The same getuser/putuser error paths are used regardless of whether AC is set. In non-exception failure cases, this results in an unnecessary CLAC. Fixes the following warnings: arch/x86/lib/getuser.o: warning: objtool: .altinstr_replacement+0x18: redundant UACCESS disable arch/x86/lib/putuser.o: warning: objtool: .altinstr_replacement+0x18: redundant UACCESS disable Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/bc14ded2755ae75bd9010c446079e113dbddb74b.1563413318.git.jpoimboe@redhat.com --- arch/x86/lib/getuser.S | 20 ++++++++++---------- arch/x86/lib/putuser.S | 29 ++++++++++++++++------------- 2 files changed, 26 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index 74fdff968ea3..304f958c27b2 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -115,29 +115,29 @@ ENDPROC(__get_user_8) EXPORT_SYMBOL(__get_user_8) +bad_get_user_clac: + ASM_CLAC bad_get_user: xor %edx,%edx mov $(-EFAULT),%_ASM_AX - ASM_CLAC ret -END(bad_get_user) #ifdef CONFIG_X86_32 +bad_get_user_8_clac: + ASM_CLAC bad_get_user_8: xor %edx,%edx xor %ecx,%ecx mov $(-EFAULT),%_ASM_AX - ASM_CLAC ret -END(bad_get_user_8) #endif - _ASM_EXTABLE_UA(1b, bad_get_user) - _ASM_EXTABLE_UA(2b, bad_get_user) - _ASM_EXTABLE_UA(3b, bad_get_user) + _ASM_EXTABLE_UA(1b, bad_get_user_clac) + _ASM_EXTABLE_UA(2b, bad_get_user_clac) + _ASM_EXTABLE_UA(3b, bad_get_user_clac) #ifdef CONFIG_X86_64 - _ASM_EXTABLE_UA(4b, bad_get_user) + _ASM_EXTABLE_UA(4b, bad_get_user_clac) #else - _ASM_EXTABLE_UA(4b, bad_get_user_8) - _ASM_EXTABLE_UA(5b, bad_get_user_8) + _ASM_EXTABLE_UA(4b, bad_get_user_8_clac) + _ASM_EXTABLE_UA(5b, bad_get_user_8_clac) #endif diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index d2e5c9c39601..14bf78341d3c 100644 --- a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -32,8 +32,6 @@ */ #define ENTER mov PER_CPU_VAR(current_task), %_ASM_BX -#define EXIT ASM_CLAC ; \ - ret .text ENTRY(__put_user_1) @@ -43,7 +41,8 @@ ENTRY(__put_user_1) ASM_STAC 1: movb %al,(%_ASM_CX) xor %eax,%eax - EXIT + ASM_CLAC + ret ENDPROC(__put_user_1) EXPORT_SYMBOL(__put_user_1) @@ -56,7 +55,8 @@ ENTRY(__put_user_2) ASM_STAC 2: movw %ax,(%_ASM_CX) xor %eax,%eax - EXIT + ASM_CLAC + ret ENDPROC(__put_user_2) EXPORT_SYMBOL(__put_user_2) @@ -69,7 +69,8 @@ ENTRY(__put_user_4) ASM_STAC 3: movl %eax,(%_ASM_CX) xor %eax,%eax - EXIT + ASM_CLAC + ret ENDPROC(__put_user_4) EXPORT_SYMBOL(__put_user_4) @@ -85,19 +86,21 @@ ENTRY(__put_user_8) 5: movl %edx,4(%_ASM_CX) #endif xor %eax,%eax - EXIT + ASM_CLAC + RET ENDPROC(__put_user_8) EXPORT_SYMBOL(__put_user_8) +bad_put_user_clac: + ASM_CLAC bad_put_user: movl $-EFAULT,%eax - EXIT -END(bad_put_user) + RET - _ASM_EXTABLE_UA(1b, bad_put_user) - _ASM_EXTABLE_UA(2b, bad_put_user) - _ASM_EXTABLE_UA(3b, bad_put_user) - _ASM_EXTABLE_UA(4b, bad_put_user) + _ASM_EXTABLE_UA(1b, bad_put_user_clac) + _ASM_EXTABLE_UA(2b, bad_put_user_clac) + _ASM_EXTABLE_UA(3b, bad_put_user_clac) + _ASM_EXTABLE_UA(4b, bad_put_user_clac) #ifdef CONFIG_X86_32 - _ASM_EXTABLE_UA(5b, bad_put_user) + _ASM_EXTABLE_UA(5b, bad_put_user_clac) #endif -- cgit v1.2.3-55-g7522 From cd6697b8b8751b65abd7859af55cf06f36b8e716 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Tue, 16 Jul 2019 21:15:57 +0800 Subject: x86/boot/efi: Remove unused variables Fix gcc warnings: arch/x86/boot/compressed/eboot.c: In function 'make_boot_params': arch/x86/boot/compressed/eboot.c:394:6: warning: unused variable 'i' [-Wunused-variable] int i; ^ arch/x86/boot/compressed/eboot.c:393:6: warning: unused variable 's1' [-Wunused-variable] u8 *s1; ^ arch/x86/boot/compressed/eboot.c:392:7: warning: unused variable 's2' [-Wunused-variable] u16 *s2; ^ arch/x86/boot/compressed/eboot.c:387:8: warning: unused variable 'options' [-Wunused-variable] void *options, *handle; ^ arch/x86/boot/compressed/eboot.c: In function 'add_e820ext': arch/x86/boot/compressed/eboot.c:498:16: warning: unused variable 'size' [-Wunused-variable] unsigned long size; ^ arch/x86/boot/compressed/eboot.c:497:15: warning: unused variable 'status' [-Wunused-variable] efi_status_t status; ^ arch/x86/boot/compressed/eboot.c: In function 'exit_boot_func': arch/x86/boot/compressed/eboot.c:681:15: warning: unused variable 'status' [-Wunused-variable] efi_status_t status; ^ arch/x86/boot/compressed/eboot.c:680:8: warning: unused variable 'nr_desc' [-Wunused-variable] __u32 nr_desc; ^ arch/x86/boot/compressed/eboot.c: In function 'efi_main': arch/x86/boot/compressed/eboot.c:750:22: warning: unused variable 'image' [-Wunused-variable] efi_loaded_image_t *image; ^ Signed-off-by: Zhenzhong Duan Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1563282957-26898-1-git-send-email-zhenzhong.duan@oracle.com --- arch/x86/boot/compressed/eboot.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 220d1279d0e2..d6662fdef300 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -384,14 +384,11 @@ struct boot_params *make_boot_params(struct efi_config *c) struct apm_bios_info *bi; struct setup_header *hdr; efi_loaded_image_t *image; - void *options, *handle; + void *handle; efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID; int options_size = 0; efi_status_t status; char *cmdline_ptr; - u16 *s2; - u8 *s1; - int i; unsigned long ramdisk_addr; unsigned long ramdisk_size; @@ -494,8 +491,6 @@ static void add_e820ext(struct boot_params *params, struct setup_data *e820ext, u32 nr_entries) { struct setup_data *data; - efi_status_t status; - unsigned long size; e820ext->type = SETUP_E820_EXT; e820ext->len = nr_entries * sizeof(struct boot_e820_entry); @@ -677,8 +672,6 @@ static efi_status_t exit_boot_func(efi_system_table_t *sys_table_arg, void *priv) { const char *signature; - __u32 nr_desc; - efi_status_t status; struct exit_boot_struct *p = priv; signature = efi_is_64bit() ? EFI64_LOADER_SIGNATURE @@ -747,7 +740,6 @@ struct boot_params * efi_main(struct efi_config *c, struct boot_params *boot_params) { struct desc_ptr *gdt = NULL; - efi_loaded_image_t *image; struct setup_header *hdr = &boot_params->hdr; efi_status_t status; struct desc_struct *desc; -- cgit v1.2.3-55-g7522 From 449f328637e3ca62461da04d60ccb35aa5aa21dc Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Tue, 16 Jul 2019 21:17:20 +0800 Subject: x86/boot/compressed/64: Remove unused variable Fix gcc warning: arch/x86/boot/compressed/pgtable_64.c: In function 'find_trampoline_placement': arch/x86/boot/compressed/pgtable_64.c:43:16: warning: unused variable 'trampoline_start' [-Wunused-variable] unsigned long trampoline_start; ^ Signed-off-by: Zhenzhong Duan Signed-off-by: Thomas Gleixner Acked-by: Kirill A. Shutemov Link: https://lkml.kernel.org/r/1563283040-31101-1-git-send-email-zhenzhong.duan@oracle.com --- arch/x86/boot/compressed/pgtable_64.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index f8debf7aeb4c..5f2d03067ae5 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -40,7 +40,6 @@ int cmdline_find_option_bool(const char *option); static unsigned long find_trampoline_placement(void) { unsigned long bios_start = 0, ebda_start = 0; - unsigned long trampoline_start; struct boot_e820_entry *entry; char *signature; int i; -- cgit v1.2.3-55-g7522 From 8c5477e8046ca139bac250386c08453da37ec1ae Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Tue, 16 Jul 2019 21:18:12 +0800 Subject: x86, boot: Remove multiple copy of static function sanitize_boot_params() Kernel build warns: 'sanitize_boot_params' defined but not used [-Wunused-function] at below files: arch/x86/boot/compressed/cmdline.c arch/x86/boot/compressed/error.c arch/x86/boot/compressed/early_serial_console.c arch/x86/boot/compressed/acpi.c That's becausethey each include misc.h which includes a definition of sanitize_boot_params() via bootparam_utils.h. Remove the inclusion from misc.h and have the c file including bootparam_utils.h directly. Signed-off-by: Zhenzhong Duan Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1563283092-1189-1-git-send-email-zhenzhong.duan@oracle.com --- arch/x86/boot/compressed/misc.c | 1 + arch/x86/boot/compressed/misc.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 24e65a0f756d..53ac0cb2396d 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -17,6 +17,7 @@ #include "pgtable.h" #include "../string.h" #include "../voffset.h" +#include /* * WARNING!! diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index d2f184165934..c8181392f70d 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -23,7 +23,6 @@ #include #include #include -#include #define BOOT_CTYPE_H #include -- cgit v1.2.3-55-g7522 From 80ec922dbd87fd38d15719c86a94457204648aeb Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Jul 2019 15:56:51 -0700 Subject: mm/memory_hotplug: allow arch_remove_memory() without CONFIG_MEMORY_HOTREMOVE We want to improve error handling while adding memory by allowing to use arch_remove_memory() and __remove_pages() even if CONFIG_MEMORY_HOTREMOVE is not set to e.g., implement something like: arch_add_memory() rc = do_something(); if (rc) { arch_remove_memory(); } We won't get rid of CONFIG_MEMORY_HOTREMOVE for now, as it will require quite some dependencies for memory offlining. Link: http://lkml.kernel.org/r/20190527111152.16324-7-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Pavel Tatashin Cc: Tony Luck Cc: Fenghua Yu Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Heiko Carstens Cc: Yoshinori Sato Cc: Rich Felker Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Michal Hocko Cc: David Hildenbrand Cc: Oscar Salvador Cc: "Kirill A. Shutemov" Cc: Alex Deucher Cc: "David S. Miller" Cc: Mark Brown Cc: Chris Wilson Cc: Christophe Leroy Cc: Nicholas Piggin Cc: Vasily Gorbik Cc: Rob Herring Cc: Masahiro Yamada Cc: "mike.travis@hpe.com" Cc: Andrew Banman Cc: Arun KS Cc: Qian Cai Cc: Mathieu Malaterre Cc: Baoquan He Cc: Logan Gunthorpe Cc: Anshuman Khandual Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Chintan Pandya Cc: Dan Williams Cc: Ingo Molnar Cc: Jonathan Cameron Cc: Joonsoo Kim Cc: Jun Yao Cc: Mark Rutland Cc: Mike Rapoport Cc: Oscar Salvador Cc: Robin Murphy Cc: Wei Yang Cc: Will Deacon Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/mmu.c | 2 -- arch/ia64/mm/init.c | 2 -- arch/powerpc/mm/mem.c | 2 -- arch/s390/mm/init.c | 2 -- arch/sh/mm/init.c | 2 -- arch/x86/mm/init_32.c | 2 -- arch/x86/mm/init_64.c | 2 -- drivers/base/memory.c | 2 -- include/linux/memory.h | 2 -- include/linux/memory_hotplug.h | 2 -- mm/memory_hotplug.c | 2 -- mm/sparse.c | 6 ------ 12 files changed, 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index a21fa7e1167d..750a69dde39b 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1074,7 +1074,6 @@ int arch_add_memory(int nid, u64 start, u64 size, return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, restrictions); } -#ifdef CONFIG_MEMORY_HOTREMOVE void arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) { @@ -1093,4 +1092,3 @@ void arch_remove_memory(int nid, u64 start, u64 size, __remove_pages(zone, start_pfn, nr_pages, altmap); } #endif -#endif diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index d28e29103bdb..aae75fd7b810 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -681,7 +681,6 @@ int arch_add_memory(int nid, u64 start, u64 size, return ret; } -#ifdef CONFIG_MEMORY_HOTREMOVE void arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) { @@ -693,4 +692,3 @@ void arch_remove_memory(int nid, u64 start, u64 size, __remove_pages(zone, start_pfn, nr_pages, altmap); } #endif -#endif diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 26a8da3723bb..9259337d7374 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -125,7 +125,6 @@ int __ref arch_add_memory(int nid, u64 start, u64 size, return __add_pages(nid, start_pfn, nr_pages, restrictions); } -#ifdef CONFIG_MEMORY_HOTREMOVE void __ref arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) { @@ -151,7 +150,6 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size, pr_warn("Hash collision while resizing HPT\n"); } #endif -#endif /* CONFIG_MEMORY_HOTPLUG */ #ifndef CONFIG_NEED_MULTIPLE_NODES void __init mem_topology_setup(void) diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 5b1ec2f532e0..4e5bbe328594 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -286,7 +286,6 @@ int arch_add_memory(int nid, u64 start, u64 size, return rc; } -#ifdef CONFIG_MEMORY_HOTREMOVE void arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) { @@ -298,5 +297,4 @@ void arch_remove_memory(int nid, u64 start, u64 size, __remove_pages(zone, start_pfn, nr_pages, altmap); vmem_remove_mapping(start, size); } -#endif #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 13c6a6bb5fd9..dfdbaa50946e 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -429,7 +429,6 @@ int memory_add_physaddr_to_nid(u64 addr) EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif -#ifdef CONFIG_MEMORY_HOTREMOVE void arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) { @@ -440,5 +439,4 @@ void arch_remove_memory(int nid, u64 start, u64 size, zone = page_zone(pfn_to_page(start_pfn)); __remove_pages(zone, start_pfn, nr_pages, altmap); } -#endif #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index f265a4316179..4068abb9427f 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -860,7 +860,6 @@ int arch_add_memory(int nid, u64 start, u64 size, return __add_pages(nid, start_pfn, nr_pages, restrictions); } -#ifdef CONFIG_MEMORY_HOTREMOVE void arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) { @@ -872,7 +871,6 @@ void arch_remove_memory(int nid, u64 start, u64 size, __remove_pages(zone, start_pfn, nr_pages, altmap); } #endif -#endif int kernel_set_to_readonly __read_mostly; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 08bbf648827b..5a289a2ab108 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1198,7 +1198,6 @@ void __ref vmemmap_free(unsigned long start, unsigned long end, remove_pagetable(start, end, false, altmap); } -#ifdef CONFIG_MEMORY_HOTREMOVE static void __meminit kernel_physical_mapping_remove(unsigned long start, unsigned long end) { @@ -1219,7 +1218,6 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size, __remove_pages(zone, start_pfn, nr_pages, altmap); kernel_physical_mapping_remove(start, start + size); } -#endif #endif /* CONFIG_MEMORY_HOTPLUG */ static struct kcore_list kcore_vsyscall; diff --git a/drivers/base/memory.c b/drivers/base/memory.c index e0aa7f9abb36..92459d6f12be 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -723,7 +723,6 @@ out: return ret; } -#ifdef CONFIG_MEMORY_HOTREMOVE static void unregister_memory(struct memory_block *memory) { @@ -762,7 +761,6 @@ void unregister_memory_section(struct mem_section *section) out_unlock: mutex_unlock(&mem_sysfs_mutex); } -#endif /* CONFIG_MEMORY_HOTREMOVE */ /* return true if the memory block is offlined, otherwise, return false */ bool is_memblock_offlined(struct memory_block *mem) diff --git a/include/linux/memory.h b/include/linux/memory.h index e1dc1bb2b787..474c7c60c8f2 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -112,9 +112,7 @@ extern void unregister_memory_notifier(struct notifier_block *nb); extern int register_memory_isolate_notifier(struct notifier_block *nb); extern void unregister_memory_isolate_notifier(struct notifier_block *nb); int hotplug_memory_register(int nid, struct mem_section *section); -#ifdef CONFIG_MEMORY_HOTREMOVE extern void unregister_memory_section(struct mem_section *); -#endif extern int memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); extern int memory_isolate_notify(unsigned long val, void *v); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 988fde33cd7f..87bf9c4a889e 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -123,12 +123,10 @@ static inline bool movable_node_is_enabled(void) return movable_node_enabled; } -#ifdef CONFIG_MEMORY_HOTREMOVE extern void arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap); extern void __remove_pages(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap); -#endif /* CONFIG_MEMORY_HOTREMOVE */ /* * Do we want sysfs memblock files created. This will allow userspace to online diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a8c25fd85ee3..bc11888d5d7e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -318,7 +318,6 @@ out: return err; } -#ifdef CONFIG_MEMORY_HOTREMOVE /* find the smallest valid pfn in the range [start_pfn, end_pfn) */ static unsigned long find_smallest_section_pfn(int nid, struct zone *zone, unsigned long start_pfn, @@ -580,7 +579,6 @@ void __remove_pages(struct zone *zone, unsigned long phys_start_pfn, set_zone_contiguous(zone); } -#endif /* CONFIG_MEMORY_HOTREMOVE */ int set_online_page_callback(online_page_callback_t callback) { diff --git a/mm/sparse.c b/mm/sparse.c index fd13166949b5..d1d5e05f5b8d 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -604,7 +604,6 @@ static void __kfree_section_memmap(struct page *memmap, vmemmap_free(start, end, altmap); } -#ifdef CONFIG_MEMORY_HOTREMOVE static void free_map_bootmem(struct page *memmap) { unsigned long start = (unsigned long)memmap; @@ -612,7 +611,6 @@ static void free_map_bootmem(struct page *memmap) vmemmap_free(start, end, NULL); } -#endif /* CONFIG_MEMORY_HOTREMOVE */ #else static struct page *__kmalloc_section_memmap(void) { @@ -651,7 +649,6 @@ static void __kfree_section_memmap(struct page *memmap, get_order(sizeof(struct page) * PAGES_PER_SECTION)); } -#ifdef CONFIG_MEMORY_HOTREMOVE static void free_map_bootmem(struct page *memmap) { unsigned long maps_section_nr, removing_section_nr, i; @@ -681,7 +678,6 @@ static void free_map_bootmem(struct page *memmap) put_page_bootmem(page); } } -#endif /* CONFIG_MEMORY_HOTREMOVE */ #endif /* CONFIG_SPARSEMEM_VMEMMAP */ /** @@ -746,7 +742,6 @@ out: return ret; } -#ifdef CONFIG_MEMORY_HOTREMOVE #ifdef CONFIG_MEMORY_FAILURE static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) { @@ -823,5 +818,4 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, PAGES_PER_SECTION - map_offset); free_section_usemap(memmap, usemap, altmap); } -#endif /* CONFIG_MEMORY_HOTREMOVE */ #endif /* CONFIG_MEMORY_HOTPLUG */ -- cgit v1.2.3-55-g7522 From e9c0a3f05477e18d2dae816cb61b62be1b7e90d3 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 18 Jul 2019 15:58:11 -0700 Subject: mm/sparsemem: convert kmalloc_section_memmap() to populate_section_memmap() Allow sub-section sized ranges to be added to the memmap. populate_section_memmap() takes an explict pfn range rather than assuming a full section, and those parameters are plumbed all the way through to vmmemap_populate(). There should be no sub-section usage in current deployments. New warnings are added to clarify which memmap allocation paths are sub-section capable. Link: http://lkml.kernel.org/r/156092352058.979959.6551283472062305149.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reviewed-by: Pavel Tatashin Tested-by: Aneesh Kumar K.V [ppc64] Reviewed-by: Oscar Salvador Cc: Michal Hocko Cc: David Hildenbrand Cc: Logan Gunthorpe Cc: Jane Chu Cc: Jeff Moyer Cc: Jérôme Glisse Cc: Jonathan Corbet Cc: Mike Rapoport Cc: Toshi Kani Cc: Vlastimil Babka Cc: Wei Yang Cc: Jason Gunthorpe Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/init_64.c | 4 +++- include/linux/mm.h | 4 ++-- mm/sparse-vmemmap.c | 21 ++++++++++++++------- mm/sparse.c | 50 +++++++++++++++++++++++++++----------------------- 4 files changed, 46 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5a289a2ab108..a6b5c653727b 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1518,7 +1518,9 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, { int err; - if (boot_cpu_has(X86_FEATURE_PSE)) + if (end - start < PAGES_PER_SECTION * sizeof(struct page)) + err = vmemmap_populate_basepages(start, end, node); + else if (boot_cpu_has(X86_FEATURE_PSE)) err = vmemmap_populate_hugepages(start, end, node, altmap); else if (altmap) { pr_err_once("%s: no cpu support for altmap allocations\n", diff --git a/include/linux/mm.h b/include/linux/mm.h index 48ab7b982d82..0334ca97c584 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2767,8 +2767,8 @@ static inline void print_vma_addr(char *prefix, unsigned long rip) #endif void *sparse_buffer_alloc(unsigned long size); -struct page *sparse_mem_map_populate(unsigned long pnum, int nid, - struct vmem_altmap *altmap); +struct page * __populate_section_memmap(unsigned long pfn, + unsigned long nr_pages, int nid, struct vmem_altmap *altmap); pgd_t *vmemmap_pgd_populate(unsigned long addr, int node); p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node); pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node); diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 7fec05796796..200aef686722 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -245,19 +245,26 @@ int __meminit vmemmap_populate_basepages(unsigned long start, return 0; } -struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid, - struct vmem_altmap *altmap) +struct page * __meminit __populate_section_memmap(unsigned long pfn, + unsigned long nr_pages, int nid, struct vmem_altmap *altmap) { unsigned long start; unsigned long end; - struct page *map; - map = pfn_to_page(pnum * PAGES_PER_SECTION); - start = (unsigned long)map; - end = (unsigned long)(map + PAGES_PER_SECTION); + /* + * The minimum granularity of memmap extensions is + * PAGES_PER_SUBSECTION as allocations are tracked in the + * 'subsection_map' bitmap of the section. + */ + end = ALIGN(pfn + nr_pages, PAGES_PER_SUBSECTION); + pfn &= PAGE_SUBSECTION_MASK; + nr_pages = end - pfn; + + start = (unsigned long) pfn_to_page(pfn); + end = start + nr_pages * sizeof(struct page); if (vmemmap_populate(start, end, nid, altmap)) return NULL; - return map; + return pfn_to_page(pfn); } diff --git a/mm/sparse.c b/mm/sparse.c index 26b48ee1a262..6b01022e23a9 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -439,8 +439,8 @@ static unsigned long __init section_map_size(void) return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); } -struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid, - struct vmem_altmap *altmap) +struct page __init *__populate_section_memmap(unsigned long pfn, + unsigned long nr_pages, int nid, struct vmem_altmap *altmap) { unsigned long size = section_map_size(); struct page *map = sparse_buffer_alloc(size); @@ -521,10 +521,13 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin, } sparse_buffer_init(map_count * section_map_size(), nid); for_each_present_section_nr(pnum_begin, pnum) { + unsigned long pfn = section_nr_to_pfn(pnum); + if (pnum >= pnum_end) break; - map = sparse_mem_map_populate(pnum, nid, NULL); + map = __populate_section_memmap(pfn, PAGES_PER_SECTION, + nid, NULL); if (!map) { pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.", __func__, nid); @@ -625,17 +628,17 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP -static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, - struct vmem_altmap *altmap) +static struct page *populate_section_memmap(unsigned long pfn, + unsigned long nr_pages, int nid, struct vmem_altmap *altmap) { - /* This will make the necessary allocations eventually. */ - return sparse_mem_map_populate(pnum, nid, altmap); + return __populate_section_memmap(pfn, nr_pages, nid, altmap); } -static void __kfree_section_memmap(struct page *memmap, + +static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap) { - unsigned long start = (unsigned long)memmap; - unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION); + unsigned long start = (unsigned long) pfn_to_page(pfn); + unsigned long end = start + nr_pages * sizeof(struct page); vmemmap_free(start, end, altmap); } @@ -647,7 +650,8 @@ static void free_map_bootmem(struct page *memmap) vmemmap_free(start, end, NULL); } #else -static struct page *__kmalloc_section_memmap(void) +struct page *populate_section_memmap(unsigned long pfn, + unsigned long nr_pages, int nid, struct vmem_altmap *altmap) { struct page *page, *ret; unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION; @@ -668,15 +672,11 @@ got_map_ptr: return ret; } -static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, +static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap) { - return __kmalloc_section_memmap(); -} + struct page *memmap = pfn_to_page(pfn); -static void __kfree_section_memmap(struct page *memmap, - struct vmem_altmap *altmap) -{ if (is_vmalloc_addr(memmap)) vfree(memmap); else @@ -745,12 +745,13 @@ int __meminit sparse_add_one_section(int nid, unsigned long start_pfn, if (ret < 0 && ret != -EEXIST) return ret; ret = 0; - memmap = kmalloc_section_memmap(section_nr, nid, altmap); + memmap = populate_section_memmap(start_pfn, PAGES_PER_SECTION, nid, + altmap); if (!memmap) return -ENOMEM; usage = kzalloc(mem_section_usage_size(), GFP_KERNEL); if (!usage) { - __kfree_section_memmap(memmap, altmap); + depopulate_section_memmap(start_pfn, PAGES_PER_SECTION, altmap); return -ENOMEM; } @@ -773,7 +774,7 @@ int __meminit sparse_add_one_section(int nid, unsigned long start_pfn, out: if (ret < 0) { kfree(usage); - __kfree_section_memmap(memmap, altmap); + depopulate_section_memmap(start_pfn, PAGES_PER_SECTION, altmap); } return ret; } @@ -809,7 +810,8 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) #endif static void free_section_usage(struct mem_section *ms, struct page *memmap, - struct mem_section_usage *usage, struct vmem_altmap *altmap) + struct mem_section_usage *usage, unsigned long pfn, + unsigned long nr_pages, struct vmem_altmap *altmap) { if (!usage) return; @@ -820,7 +822,7 @@ static void free_section_usage(struct mem_section *ms, struct page *memmap, if (!early_section(ms)) { kfree(usage); if (memmap) - __kfree_section_memmap(memmap, altmap); + depopulate_section_memmap(pfn, nr_pages, altmap); return; } @@ -849,6 +851,8 @@ void sparse_remove_one_section(struct mem_section *ms, unsigned long map_offset, clear_hwpoisoned_pages(memmap + map_offset, PAGES_PER_SECTION - map_offset); - free_section_usage(ms, memmap, usage, altmap); + free_section_usage(ms, memmap, usage, + section_nr_to_pfn(__section_nr(ms)), + PAGES_PER_SECTION, altmap); } #endif /* CONFIG_MEMORY_HOTPLUG */ -- cgit v1.2.3-55-g7522 From eec4844fae7c033a0c1fc1eb3b8517aeb8b6cc49 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Thu, 18 Jul 2019 15:58:50 -0700 Subject: proc/sysctl: add shared variables for range check In the sysctl code the proc_dointvec_minmax() function is often used to validate the user supplied value between an allowed range. This function uses the extra1 and extra2 members from struct ctl_table as minimum and maximum allowed value. On sysctl handler declaration, in every source file there are some readonly variables containing just an integer which address is assigned to the extra1 and extra2 members, so the sysctl range is enforced. The special values 0, 1 and INT_MAX are very often used as range boundary, leading duplication of variables like zero=0, one=1, int_max=INT_MAX in different source files: $ git grep -E '\.extra[12].*&(zero|one|int_max)' |wc -l 248 Add a const int array containing the most commonly used values, some macros to refer more easily to the correct array member, and use them instead of creating a local one for every object file. This is the bloat-o-meter output comparing the old and new binary compiled with the default Fedora config: # scripts/bloat-o-meter -d vmlinux.o.old vmlinux.o add/remove: 2/2 grow/shrink: 0/2 up/down: 24/-188 (-164) Data old new delta sysctl_vals - 12 +12 __kstrtab_sysctl_vals - 12 +12 max 14 10 -4 int_max 16 - -16 one 68 - -68 zero 128 28 -100 Total: Before=20583249, After=20583085, chg -0.00% [mcroce@redhat.com: tipc: remove two unused variables] Link: http://lkml.kernel.org/r/20190530091952.4108-1-mcroce@redhat.com [akpm@linux-foundation.org: fix net/ipv6/sysctl_net_ipv6.c] [arnd@arndb.de: proc/sysctl: make firmware loader table conditional] Link: http://lkml.kernel.org/r/20190617130014.1713870-1-arnd@arndb.de [akpm@linux-foundation.org: fix fs/eventpoll.c] Link: http://lkml.kernel.org/r/20190430180111.10688-1-mcroce@redhat.com Signed-off-by: Matteo Croce Signed-off-by: Arnd Bergmann Acked-by: Kees Cook Reviewed-by: Aaron Tomlin Cc: Matthew Wilcox Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/appldata/appldata_base.c | 15 +- arch/s390/kernel/topology.c | 6 +- arch/x86/entry/vdso/vdso32-setup.c | 7 +- arch/x86/kernel/itmt.c | 6 +- drivers/base/firmware_loader/fallback_table.c | 13 +- drivers/gpu/drm/i915/i915_perf.c | 8 +- drivers/hv/vmbus_drv.c | 6 +- drivers/tty/tty_ldisc.c | 6 +- drivers/xen/balloon.c | 7 +- fs/eventpoll.c | 4 +- fs/notify/inotify/inotify_user.c | 8 +- fs/proc/proc_sysctl.c | 4 + include/linux/sysctl.h | 7 + ipc/ipc_sysctl.c | 35 +++-- kernel/pid_namespace.c | 3 +- kernel/sysctl.c | 197 +++++++++++++------------- kernel/ucount.c | 6 +- net/core/neighbour.c | 20 ++- net/core/sysctl_net_core.c | 34 +++-- net/dccp/sysctl.c | 16 +-- net/ipv4/sysctl_net_ipv4.c | 60 ++++---- net/ipv6/addrconf.c | 6 +- net/ipv6/route.c | 7 +- net/ipv6/sysctl_net_ipv6.c | 10 +- net/mpls/af_mpls.c | 10 +- net/netfilter/ipvs/ip_vs_ctl.c | 3 +- net/rxrpc/sysctl.c | 9 +- net/sctp/sysctl.c | 35 +++-- net/sunrpc/xprtrdma/transport.c | 3 +- net/tipc/sysctl.c | 6 +- security/keys/sysctl.c | 26 ++-- security/loadpin/loadpin.c | 6 +- security/yama/yama_lsm.c | 3 +- 33 files changed, 270 insertions(+), 322 deletions(-) (limited to 'arch/x86') diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c index e4b58240ec53..aa738cad1338 100644 --- a/arch/s390/appldata/appldata_base.c +++ b/arch/s390/appldata/appldata_base.c @@ -220,15 +220,13 @@ appldata_timer_handler(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int timer_active = appldata_timer_active; - int zero = 0; - int one = 1; int rc; struct ctl_table ctl_entry = { .procname = ctl->procname, .data = &timer_active, .maxlen = sizeof(int), - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }; rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos); @@ -255,13 +253,12 @@ appldata_interval_handler(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int interval = appldata_interval; - int one = 1; int rc; struct ctl_table ctl_entry = { .procname = ctl->procname, .data = &interval, .maxlen = sizeof(int), - .extra1 = &one, + .extra1 = SYSCTL_ONE, }; rc = proc_dointvec_minmax(&ctl_entry, write, buffer, lenp, ppos); @@ -289,13 +286,11 @@ appldata_generic_handler(struct ctl_table *ctl, int write, struct list_head *lh; int rc, found; int active; - int zero = 0; - int one = 1; struct ctl_table ctl_entry = { .data = &active, .maxlen = sizeof(int), - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }; found = 0; diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 8964a3f60aad..2db6fb405a9a 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -587,15 +587,13 @@ static int topology_ctl_handler(struct ctl_table *ctl, int write, { int enabled = topology_is_enabled(); int new_mode; - int zero = 0; - int one = 1; int rc; struct ctl_table ctl_entry = { .procname = ctl->procname, .data = &enabled, .maxlen = sizeof(int), - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }; rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos); diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c index 42d4c89f990e..240626e7f55a 100644 --- a/arch/x86/entry/vdso/vdso32-setup.c +++ b/arch/x86/entry/vdso/vdso32-setup.c @@ -65,9 +65,6 @@ subsys_initcall(sysenter_setup); /* Register vsyscall32 into the ABI table */ #include -static const int zero; -static const int one = 1; - static struct ctl_table abi_table2[] = { { .procname = "vsyscall32", @@ -75,8 +72,8 @@ static struct ctl_table abi_table2[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (int *)&zero, - .extra2 = (int *)&one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, {} }; diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index 838cf8a32c49..1cb3ca9bba49 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -65,8 +65,6 @@ static int sched_itmt_update_handler(struct ctl_table *table, int write, return ret; } -static unsigned int zero; -static unsigned int one = 1; static struct ctl_table itmt_kern_table[] = { { .procname = "sched_itmt_enabled", @@ -74,8 +72,8 @@ static struct ctl_table itmt_kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sched_itmt_update_handler, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, {} }; diff --git a/drivers/base/firmware_loader/fallback_table.c b/drivers/base/firmware_loader/fallback_table.c index 776dd69cf5be..ba9d30b28edc 100644 --- a/drivers/base/firmware_loader/fallback_table.c +++ b/drivers/base/firmware_loader/fallback_table.c @@ -16,9 +16,6 @@ * firmware fallback configuration table */ -static unsigned int zero; -static unsigned int one = 1; - struct firmware_fallback_config fw_fallback_config = { .force_sysfs_fallback = IS_ENABLED(CONFIG_FW_LOADER_USER_HELPER_FALLBACK), .loading_timeout = 60, @@ -26,6 +23,7 @@ struct firmware_fallback_config fw_fallback_config = { }; EXPORT_SYMBOL_GPL(fw_fallback_config); +#ifdef CONFIG_SYSCTL struct ctl_table firmware_config_table[] = { { .procname = "force_sysfs_fallback", @@ -33,8 +31,8 @@ struct ctl_table firmware_config_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_douintvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "ignore_sysfs_fallback", @@ -42,9 +40,10 @@ struct ctl_table firmware_config_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_douintvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { } }; EXPORT_SYMBOL_GPL(firmware_config_table); +#endif diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 3d8162d28730..a700c5c3d167 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -274,8 +274,6 @@ #define POLL_PERIOD (NSEC_PER_SEC / POLL_FREQUENCY) /* for sysctl proc_dointvec_minmax of dev.i915.perf_stream_paranoid */ -static int zero; -static int one = 1; static u32 i915_perf_stream_paranoid = true; /* The maximum exponent the hardware accepts is 63 (essentially it selects one @@ -3366,8 +3364,8 @@ static struct ctl_table oa_table[] = { .maxlen = sizeof(i915_perf_stream_paranoid), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "oa_max_sample_rate", @@ -3375,7 +3373,7 @@ static struct ctl_table oa_table[] = { .maxlen = sizeof(i915_oa_max_sample_rate), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &oa_sample_rate_hard_limit, }, {} diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 894da5abdc55..ebd35fc35290 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1197,8 +1197,6 @@ static struct kmsg_dumper hv_kmsg_dumper = { }; static struct ctl_table_header *hv_ctl_table_hdr; -static int zero; -static int one = 1; /* * sysctl option to allow the user to control whether kmsg data should be @@ -1211,8 +1209,8 @@ static struct ctl_table hv_ctl_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE }, {} }; diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c index fde8d4073e74..4c49f53afa3e 100644 --- a/drivers/tty/tty_ldisc.c +++ b/drivers/tty/tty_ldisc.c @@ -855,8 +855,6 @@ void tty_ldisc_deinit(struct tty_struct *tty) tty->ldisc = NULL; } -static int zero; -static int one = 1; static struct ctl_table tty_table[] = { { .procname = "ldisc_autoload", @@ -864,8 +862,8 @@ static struct ctl_table tty_table[] = { .maxlen = sizeof(tty_ldisc_autoload), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { } }; diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index d37dd5bb7a8f..37a36c6b9f93 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -77,9 +77,6 @@ static int xen_hotplug_unpopulated; #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG -static int zero; -static int one = 1; - static struct ctl_table balloon_table[] = { { .procname = "hotplug_unpopulated", @@ -87,8 +84,8 @@ static struct ctl_table balloon_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { } }; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 0f9c073d78d5..d7f1f5011fac 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -291,7 +291,7 @@ static LIST_HEAD(tfile_check_list); #include -static long zero; +static long long_zero; static long long_max = LONG_MAX; struct ctl_table epoll_table[] = { @@ -301,7 +301,7 @@ struct ctl_table epoll_table[] = { .maxlen = sizeof(max_user_watches), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra1 = &zero, + .extra1 = &long_zero, .extra2 = &long_max, }, { } diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index cce8de32779f..0b815178126e 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -45,8 +45,6 @@ struct kmem_cache *inotify_inode_mark_cachep __read_mostly; #include -static int zero; - struct ctl_table inotify_table[] = { { .procname = "max_user_instances", @@ -54,7 +52,7 @@ struct ctl_table inotify_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "max_user_watches", @@ -62,7 +60,7 @@ struct ctl_table inotify_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "max_queued_events", @@ -70,7 +68,7 @@ struct ctl_table inotify_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero + .extra1 = SYSCTL_ZERO }, { } }; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 36ad1b0d6259..d80989b6c344 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -22,6 +22,10 @@ static const struct inode_operations proc_sys_inode_operations; static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; +/* shared constants to be used in various sysctls */ +const int sysctl_vals[] = { 0, 1, INT_MAX }; +EXPORT_SYMBOL(sysctl_vals); + /* Support for permanently empty directories */ struct ctl_table sysctl_mount_point[] = { diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index aadd310769d0..6df477329b76 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -37,6 +37,13 @@ struct ctl_table_root; struct ctl_table_header; struct ctl_dir; +/* Keep the same order as in fs/proc/proc_sysctl.c */ +#define SYSCTL_ZERO ((void *)&sysctl_vals[0]) +#define SYSCTL_ONE ((void *)&sysctl_vals[1]) +#define SYSCTL_INT_MAX ((void *)&sysctl_vals[2]) + +extern const int sysctl_vals[]; + typedef int proc_handler (struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos); diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index 2b14ce8ce73f..affd66537e87 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -113,9 +113,6 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write, #define proc_ipc_sem_dointvec NULL #endif -static int zero; -static int one = 1; -static int int_max = INT_MAX; int ipc_mni = IPCMNI; int ipc_mni_shift = IPCMNI_SHIFT; int ipc_min_cycle = RADIX_TREE_MAP_SIZE; @@ -141,7 +138,7 @@ static struct ctl_table ipc_kern_table[] = { .maxlen = sizeof(init_ipc_ns.shm_ctlmni), .mode = 0644, .proc_handler = proc_ipc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &ipc_mni, }, { @@ -150,8 +147,8 @@ static struct ctl_table ipc_kern_table[] = { .maxlen = sizeof(init_ipc_ns.shm_rmid_forced), .mode = 0644, .proc_handler = proc_ipc_dointvec_minmax_orphans, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "msgmax", @@ -159,8 +156,8 @@ static struct ctl_table ipc_kern_table[] = { .maxlen = sizeof(init_ipc_ns.msg_ctlmax), .mode = 0644, .proc_handler = proc_ipc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &int_max, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "msgmni", @@ -168,7 +165,7 @@ static struct ctl_table ipc_kern_table[] = { .maxlen = sizeof(init_ipc_ns.msg_ctlmni), .mode = 0644, .proc_handler = proc_ipc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &ipc_mni, }, { @@ -177,8 +174,8 @@ static struct ctl_table ipc_kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_ipc_auto_msgmni, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "msgmnb", @@ -186,8 +183,8 @@ static struct ctl_table ipc_kern_table[] = { .maxlen = sizeof(init_ipc_ns.msg_ctlmnb), .mode = 0644, .proc_handler = proc_ipc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &int_max, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "sem", @@ -203,8 +200,8 @@ static struct ctl_table ipc_kern_table[] = { .maxlen = sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id), .mode = 0644, .proc_handler = proc_ipc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &int_max, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "msg_next_id", @@ -212,8 +209,8 @@ static struct ctl_table ipc_kern_table[] = { .maxlen = sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id), .mode = 0644, .proc_handler = proc_ipc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &int_max, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "shm_next_id", @@ -221,8 +218,8 @@ static struct ctl_table ipc_kern_table[] = { .maxlen = sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id), .mode = 0644, .proc_handler = proc_ipc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &int_max, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, #endif {} diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 6d726cef241c..a6a79f85c81a 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -291,14 +291,13 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, } extern int pid_max; -static int zero = 0; static struct ctl_table pid_ns_ctl_table[] = { { .procname = "ns_last_pid", .maxlen = sizeof(int), .mode = 0666, /* permissions are checked in the handler */ .proc_handler = pid_ns_ctl_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &pid_max, }, { } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 43186ccfa139..078950d9605b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -125,9 +125,6 @@ static int sixty = 60; #endif static int __maybe_unused neg_one = -1; - -static int zero; -static int __maybe_unused one = 1; static int __maybe_unused two = 2; static int __maybe_unused four = 4; static unsigned long zero_ul; @@ -385,8 +382,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sysctl_schedstats, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif /* CONFIG_SCHEDSTATS */ #endif /* CONFIG_SMP */ @@ -418,7 +415,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = SYSCTL_ONE, }, { .procname = "numa_balancing", @@ -426,8 +423,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sysctl_numa_balancing, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_SCHED_DEBUG */ @@ -475,8 +472,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif #ifdef CONFIG_CFS_BANDWIDTH @@ -486,7 +483,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = SYSCTL_ONE, }, #endif #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) @@ -496,8 +493,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sched_energy_aware_handler, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif #ifdef CONFIG_PROVE_LOCKING @@ -562,7 +559,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &neg_one, - .extra2 = &one, + .extra2 = SYSCTL_ONE, }, #endif #ifdef CONFIG_LATENCYTOP @@ -696,8 +693,8 @@ static struct ctl_table kern_table[] = { .mode = 0644, /* only handle a transition from default "0" to "1" */ .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - .extra2 = &one, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_ONE, }, #endif #ifdef CONFIG_MODULES @@ -715,8 +712,8 @@ static struct ctl_table kern_table[] = { .mode = 0644, /* only handle a transition from default "0" to "1" */ .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - .extra2 = &one, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_ONE, }, #endif #ifdef CONFIG_UEVENT_HELPER @@ -875,7 +872,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &ten_thousand, }, { @@ -891,8 +888,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax_sysadmin, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "kptr_restrict", @@ -900,7 +897,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax_sysadmin, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &two, }, #endif @@ -925,8 +922,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_watchdog, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "watchdog_thresh", @@ -934,7 +931,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_watchdog_thresh, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &sixty, }, { @@ -943,8 +940,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = NMI_WATCHDOG_SYSCTL_PERM, .proc_handler = proc_nmi_watchdog, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "watchdog_cpumask", @@ -960,8 +957,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_soft_watchdog, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "softlockup_panic", @@ -969,8 +966,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #ifdef CONFIG_SMP { @@ -979,8 +976,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif /* CONFIG_SMP */ #endif @@ -991,8 +988,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #ifdef CONFIG_SMP { @@ -1001,8 +998,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif /* CONFIG_SMP */ #endif @@ -1115,8 +1112,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "hung_task_check_count", @@ -1124,7 +1121,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "hung_task_timeout_secs", @@ -1201,7 +1198,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(sysctl_perf_event_sample_rate), .mode = 0644, .proc_handler = perf_proc_update_handler, - .extra1 = &one, + .extra1 = SYSCTL_ONE, }, { .procname = "perf_cpu_time_max_percent", @@ -1209,7 +1206,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(sysctl_perf_cpu_time_max_percent), .mode = 0644, .proc_handler = perf_cpu_time_max_percent_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &one_hundred, }, { @@ -1218,7 +1215,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(sysctl_perf_event_max_stack), .mode = 0644, .proc_handler = perf_event_max_stack_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &six_hundred_forty_kb, }, { @@ -1227,7 +1224,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack), .mode = 0644, .proc_handler = perf_event_max_stack_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &one_thousand, }, #endif @@ -1237,8 +1234,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) { @@ -1247,8 +1244,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = timer_migration_handler, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif #ifdef CONFIG_BPF_SYSCALL @@ -1259,8 +1256,8 @@ static struct ctl_table kern_table[] = { .mode = 0644, /* only handle a transition from default "0" to "1" */ .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - .extra2 = &one, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_ONE, }, { .procname = "bpf_stats_enabled", @@ -1277,8 +1274,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(sysctl_panic_on_rcu_stall), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif #ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE @@ -1288,8 +1285,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0600, .proc_handler = stack_erasing_sysctl, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif { } @@ -1302,7 +1299,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(sysctl_overcommit_memory), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &two, }, { @@ -1311,7 +1308,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(sysctl_panic_on_oom), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &two, }, { @@ -1348,7 +1345,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "dirty_background_ratio", @@ -1356,7 +1353,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(dirty_background_ratio), .mode = 0644, .proc_handler = dirty_background_ratio_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &one_hundred, }, { @@ -1373,7 +1370,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(vm_dirty_ratio), .mode = 0644, .proc_handler = dirty_ratio_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &one_hundred, }, { @@ -1397,7 +1394,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(dirty_expire_interval), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "dirtytime_expire_seconds", @@ -1405,7 +1402,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(dirtytime_expire_interval), .mode = 0644, .proc_handler = dirtytime_interval_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "swappiness", @@ -1413,7 +1410,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(vm_swappiness), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &one_hundred, }, #ifdef CONFIG_HUGETLB_PAGE @@ -1438,8 +1435,8 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = sysctl_vm_numa_stat_handler, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif { @@ -1470,7 +1467,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = drop_caches_sysctl_handler, - .extra1 = &one, + .extra1 = SYSCTL_ONE, .extra2 = &four, }, #ifdef CONFIG_COMPACTION @@ -1496,8 +1493,8 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif /* CONFIG_COMPACTION */ @@ -1507,7 +1504,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(min_free_kbytes), .mode = 0644, .proc_handler = min_free_kbytes_sysctl_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "watermark_boost_factor", @@ -1515,7 +1512,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(watermark_boost_factor), .mode = 0644, .proc_handler = watermark_boost_factor_sysctl_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "watermark_scale_factor", @@ -1523,7 +1520,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(watermark_scale_factor), .mode = 0644, .proc_handler = watermark_scale_factor_sysctl_handler, - .extra1 = &one, + .extra1 = SYSCTL_ONE, .extra2 = &one_thousand, }, { @@ -1532,7 +1529,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(percpu_pagelist_fraction), .mode = 0644, .proc_handler = percpu_pagelist_fraction_sysctl_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, #ifdef CONFIG_MMU { @@ -1541,7 +1538,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(sysctl_max_map_count), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, #else { @@ -1550,7 +1547,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(sysctl_nr_trim_pages), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, #endif { @@ -1566,7 +1563,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(block_dump), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "vfs_cache_pressure", @@ -1574,7 +1571,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(sysctl_vfs_cache_pressure), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT { @@ -1583,7 +1580,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(sysctl_legacy_va_layout), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, #endif #ifdef CONFIG_NUMA @@ -1593,7 +1590,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(node_reclaim_mode), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "min_unmapped_ratio", @@ -1601,7 +1598,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(sysctl_min_unmapped_ratio), .mode = 0644, .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &one_hundred, }, { @@ -1610,7 +1607,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(sysctl_min_slab_ratio), .mode = 0644, .proc_handler = sysctl_min_slab_ratio_sysctl_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &one_hundred, }, #endif @@ -1661,7 +1658,7 @@ static struct ctl_table vm_table[] = { #endif .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, #endif #ifdef CONFIG_HIGHMEM @@ -1671,8 +1668,8 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(vm_highmem_is_dirtyable), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif #ifdef CONFIG_MEMORY_FAILURE @@ -1682,8 +1679,8 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(sysctl_memory_failure_early_kill), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "memory_failure_recovery", @@ -1691,8 +1688,8 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(sysctl_memory_failure_recovery), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif { @@ -1738,8 +1735,8 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(sysctl_unprivileged_userfaultfd), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif { } @@ -1875,8 +1872,8 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(int), .mode = 0600, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "protected_hardlinks", @@ -1884,8 +1881,8 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(int), .mode = 0600, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "protected_fifos", @@ -1893,7 +1890,7 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(int), .mode = 0600, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &two, }, { @@ -1902,7 +1899,7 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(int), .mode = 0600, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &two, }, { @@ -1911,7 +1908,7 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax_coredump, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &two, }, #if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) @@ -1948,7 +1945,7 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = SYSCTL_ONE, }, { } }; @@ -1970,8 +1967,8 @@ static struct ctl_table debug_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_kprobes_optimization_handler, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif { } @@ -3395,8 +3392,8 @@ int proc_do_static_key(struct ctl_table *table, int write, .data = &val, .maxlen = sizeof(val), .mode = table->mode, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }; if (write && !capable(CAP_SYS_ADMIN)) diff --git a/kernel/ucount.c b/kernel/ucount.c index feb128c7b5d9..a53cc2b4179c 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -52,16 +52,14 @@ static struct ctl_table_root set_root = { .permissions = set_permissions, }; -static int zero = 0; -static int int_max = INT_MAX; #define UCOUNT_ENTRY(name) \ { \ .procname = name, \ .maxlen = sizeof(int), \ .mode = 0644, \ .proc_handler = proc_dointvec_minmax, \ - .extra1 = &zero, \ - .extra2 = &int_max, \ + .extra1 = SYSCTL_ZERO, \ + .extra2 = SYSCTL_INT_MAX, \ } static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_user_namespaces"), diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 742cea4ce72e..26da97359d5b 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3374,8 +3374,6 @@ void neigh_app_ns(struct neighbour *n) EXPORT_SYMBOL(neigh_app_ns); #ifdef CONFIG_SYSCTL -static int zero; -static int int_max = INT_MAX; static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN); static int proc_unres_qlen(struct ctl_table *ctl, int write, @@ -3384,7 +3382,7 @@ static int proc_unres_qlen(struct ctl_table *ctl, int write, int size, ret; struct ctl_table tmp = *ctl; - tmp.extra1 = &zero; + tmp.extra1 = SYSCTL_ZERO; tmp.extra2 = &unres_qlen_max; tmp.data = &size; @@ -3449,8 +3447,8 @@ static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write, struct ctl_table tmp = *ctl; int ret; - tmp.extra1 = &zero; - tmp.extra2 = &int_max; + tmp.extra1 = SYSCTL_ZERO; + tmp.extra2 = SYSCTL_INT_MAX; ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); @@ -3595,24 +3593,24 @@ static struct neigh_sysctl_table { .procname = "gc_thresh1", .maxlen = sizeof(int), .mode = 0644, - .extra1 = &zero, - .extra2 = &int_max, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, .proc_handler = proc_dointvec_minmax, }, [NEIGH_VAR_GC_THRESH2] = { .procname = "gc_thresh2", .maxlen = sizeof(int), .mode = 0644, - .extra1 = &zero, - .extra2 = &int_max, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, .proc_handler = proc_dointvec_minmax, }, [NEIGH_VAR_GC_THRESH3] = { .procname = "gc_thresh3", .maxlen = sizeof(int), .mode = 0644, - .extra1 = &zero, - .extra2 = &int_max, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, .proc_handler = proc_dointvec_minmax, }, {}, diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index f9204719aeee..8da5b3a54dac 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -22,8 +22,6 @@ #include #include -static int zero = 0; -static int one = 1; static int two __maybe_unused = 2; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; @@ -390,10 +388,10 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax_bpf_enable, # ifdef CONFIG_BPF_JIT_ALWAYS_ON - .extra1 = &one, - .extra2 = &one, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_ONE, # else - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &two, # endif }, @@ -404,7 +402,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0600, .proc_handler = proc_dointvec_minmax_bpf_restricted, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &two, }, { @@ -413,8 +411,8 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0600, .proc_handler = proc_dointvec_minmax_bpf_restricted, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, # endif { @@ -461,8 +459,8 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE }, #ifdef CONFIG_RPS { @@ -493,7 +491,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "busy_read", @@ -501,7 +499,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, #endif #ifdef CONFIG_NET_SCHED @@ -533,7 +531,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = SYSCTL_ONE, .extra2 = &max_skb_frags, }, { @@ -542,7 +540,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "fb_tunnels_only_for_init_net", @@ -550,8 +548,8 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "devconf_inherit_init_net", @@ -559,7 +557,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &two, }, { @@ -578,7 +576,7 @@ static struct ctl_table netns_core_table[] = { .data = &init_net.core.sysctl_somaxconn, .maxlen = sizeof(int), .mode = 0644, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .proc_handler = proc_dointvec_minmax }, { } diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c index b59040f268a9..ee8d4f5afa72 100644 --- a/net/dccp/sysctl.c +++ b/net/dccp/sysctl.c @@ -16,9 +16,7 @@ #endif /* Boundary values */ -static int zero = 0, - one = 1, - u8_max = 0xFF; +static int u8_max = 0xFF; static unsigned long seqw_min = DCCPF_SEQ_WMIN, seqw_max = 0xFFFFFFFF; /* maximum on 32 bit */ @@ -38,7 +36,7 @@ static struct ctl_table dccp_default_table[] = { .maxlen = sizeof(sysctl_dccp_rx_ccid), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &u8_max, /* RFC 4340, 10. */ }, { @@ -47,7 +45,7 @@ static struct ctl_table dccp_default_table[] = { .maxlen = sizeof(sysctl_dccp_tx_ccid), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &u8_max, /* RFC 4340, 10. */ }, { @@ -56,7 +54,7 @@ static struct ctl_table dccp_default_table[] = { .maxlen = sizeof(sysctl_dccp_request_retries), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = SYSCTL_ONE, .extra2 = &u8_max, }, { @@ -65,7 +63,7 @@ static struct ctl_table dccp_default_table[] = { .maxlen = sizeof(sysctl_dccp_retries1), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &u8_max, }, { @@ -74,7 +72,7 @@ static struct ctl_table dccp_default_table[] = { .maxlen = sizeof(sysctl_dccp_retries2), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &u8_max, }, { @@ -83,7 +81,7 @@ static struct ctl_table dccp_default_table[] = { .maxlen = sizeof(sysctl_dccp_tx_qlen), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "sync_ratelimit", diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 7d66306b5f39..0b980e841927 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -28,8 +28,6 @@ #include #include -static int zero; -static int one = 1; static int two = 2; static int four = 4; static int thousand = 1000; @@ -576,7 +574,7 @@ static struct ctl_table ipv4_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "icmp_msgs_burst", @@ -584,7 +582,7 @@ static struct ctl_table ipv4_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "udp_mem", @@ -674,8 +672,8 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif { @@ -763,8 +761,8 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = ipv4_fwd_update_priority, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "ip_nonlocal_bind", @@ -794,8 +792,8 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif { @@ -864,7 +862,7 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one + .extra1 = SYSCTL_ONE }, #endif { @@ -969,7 +967,7 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &two, }, { @@ -1011,7 +1009,7 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_tfo_blackhole_detect_timeout, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, #ifdef CONFIG_IP_ROUTE_MULTIPATH { @@ -1020,8 +1018,8 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "fib_multipath_hash_policy", @@ -1029,8 +1027,8 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_fib_multipath_hash_policy, - .extra1 = &zero, - .extra2 = &two, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif { @@ -1047,8 +1045,8 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif { @@ -1078,7 +1076,7 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &four, }, { @@ -1222,7 +1220,7 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = SYSCTL_ONE, .extra2 = &gso_max_segs, }, { @@ -1231,7 +1229,7 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &one_day_secs }, { @@ -1240,8 +1238,8 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "tcp_invalid_ratelimit", @@ -1256,7 +1254,7 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &thousand, }, { @@ -1265,7 +1263,7 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &thousand, }, { @@ -1274,7 +1272,7 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(init_net.ipv4.sysctl_tcp_wmem), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = SYSCTL_ONE, }, { .procname = "tcp_rmem", @@ -1282,7 +1280,7 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(init_net.ipv4.sysctl_tcp_rmem), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = SYSCTL_ONE, }, { .procname = "tcp_comp_sack_delay_ns", @@ -1297,7 +1295,7 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &comp_sack_nr_max, }, { @@ -1306,7 +1304,7 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(init_net.ipv4.sysctl_udp_rmem_min), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one + .extra1 = SYSCTL_ONE }, { .procname = "udp_wmem_min", @@ -1314,7 +1312,7 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(init_net.ipv4.sysctl_udp_wmem_min), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one + .extra1 = SYSCTL_ONE }, { } }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 521e3203e83a..dc73888c7859 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6432,8 +6432,6 @@ int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write, } static int minus_one = -1; -static const int zero = 0; -static const int one = 1; static const int two_five_five = 255; static const struct ctl_table addrconf_sysctl[] = { @@ -6450,7 +6448,7 @@ static const struct ctl_table addrconf_sysctl[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&one, + .extra1 = (void *)SYSCTL_ONE, .extra2 = (void *)&two_five_five, }, { @@ -6809,7 +6807,7 @@ static const struct ctl_table addrconf_sysctl[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&zero, + .extra1 = (void *)SYSCTL_ZERO, .extra2 = (void *)&two_five_five, }, { diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 4d2e6b31a8d6..8b0c33fb19a2 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6031,9 +6031,6 @@ int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, return 0; } -static int zero; -static int one = 1; - static struct ctl_table ipv6_route_table_template[] = { { .procname = "flush", @@ -6111,8 +6108,8 @@ static struct ctl_table ipv6_route_table_template[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { } }; diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index dc4c91e0bfb8..ec8fcfc60a27 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -21,8 +21,6 @@ #include #endif -static int zero; -static int one = 1; static int flowlabel_reflect_max = 0x7; static int auto_flowlabels_min; static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; @@ -115,7 +113,7 @@ static struct ctl_table ipv6_table_template[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &flowlabel_reflect_max, }, { @@ -152,8 +150,8 @@ static struct ctl_table ipv6_table_template[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_rt6_multipath_hash_policy, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "seg6_flowlabel", @@ -179,7 +177,7 @@ static struct ctl_table ipv6_rotable[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one + .extra1 = SYSCTL_ONE }, #ifdef CONFIG_NETLABEL { diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 198ec4fe4148..c312741df2ce 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -37,8 +37,6 @@ #define MPLS_NEIGH_TABLE_UNSPEC (NEIGH_LINK_TABLE + 1) -static int zero = 0; -static int one = 1; static int label_limit = (1 << 20) - 1; static int ttl_max = 255; @@ -2607,7 +2605,7 @@ static int mpls_platform_labels(struct ctl_table *table, int write, .data = &platform_labels, .maxlen = sizeof(int), .mode = table->mode, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &label_limit, }; @@ -2636,8 +2634,8 @@ static const struct ctl_table mpls_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "default_ttl", @@ -2645,7 +2643,7 @@ static const struct ctl_table mpls_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = SYSCTL_ONE, .extra2 = &ttl_max, }, { } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 07e0967bf129..060565e7d227 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1726,7 +1726,6 @@ static int ip_vs_zero_all(struct netns_ipvs *ipvs) #ifdef CONFIG_SYSCTL -static int zero; static int three = 3; static int @@ -1935,7 +1934,7 @@ static struct ctl_table vs_vars[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &three, }, { diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index 1e3fa67d91aa..2bbb38161851 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -11,7 +11,6 @@ #include "ar-internal.h" static struct ctl_table_header *rxrpc_sysctl_reg_table; -static const unsigned int one = 1; static const unsigned int four = 4; static const unsigned int thirtytwo = 32; static const unsigned int n_65535 = 65535; @@ -97,7 +96,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&one, + .extra1 = (void *)SYSCTL_ONE, .extra2 = (void *)&rxrpc_max_client_connections, }, { @@ -115,7 +114,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&one, + .extra1 = (void *)SYSCTL_ONE, .extra2 = (void *)&n_max_acks, }, { @@ -124,7 +123,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&one, + .extra1 = (void *)SYSCTL_ONE, .extra2 = (void *)&n_65535, }, { @@ -133,7 +132,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&one, + .extra1 = (void *)SYSCTL_ONE, .extra2 = (void *)&four, }, diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 9a19147902f1..1250751bca1b 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -25,10 +25,7 @@ #include #include -static int zero = 0; -static int one = 1; static int timer_max = 86400000; /* ms in one day */ -static int int_max = INT_MAX; static int sack_timer_min = 1; static int sack_timer_max = 500; static int addr_scope_max = SCTP_SCOPE_POLICY_MAX; @@ -92,7 +89,7 @@ static struct ctl_table sctp_net_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = SYSCTL_ONE, .extra2 = &timer_max }, { @@ -101,7 +98,7 @@ static struct ctl_table sctp_net_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_sctp_do_rto_min, - .extra1 = &one, + .extra1 = SYSCTL_ONE, .extra2 = &init_net.sctp.rto_max }, { @@ -137,8 +134,8 @@ static struct ctl_table sctp_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &int_max + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "cookie_preserve_enable", @@ -160,7 +157,7 @@ static struct ctl_table sctp_net_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = SYSCTL_ONE, .extra2 = &timer_max }, { @@ -178,7 +175,7 @@ static struct ctl_table sctp_net_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = SYSCTL_ONE, .extra2 = &timer_max }, { @@ -187,8 +184,8 @@ static struct ctl_table sctp_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - .extra2 = &int_max + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "path_max_retrans", @@ -196,8 +193,8 @@ static struct ctl_table sctp_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - .extra2 = &int_max + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "max_init_retransmits", @@ -205,8 +202,8 @@ static struct ctl_table sctp_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - .extra2 = &int_max + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "pf_retrans", @@ -214,8 +211,8 @@ static struct ctl_table sctp_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &int_max + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "sndbuf_policy", @@ -286,7 +283,7 @@ static struct ctl_table sctp_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &addr_scope_max, }, { @@ -295,7 +292,7 @@ static struct ctl_table sctp_net_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec_minmax, - .extra1 = &one, + .extra1 = SYSCTL_ONE, .extra2 = &rwnd_scale_max, }, { diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 1f73a6a7e43c..ffb1684c4573 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -80,7 +80,6 @@ static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE; static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE; static unsigned int min_inline_size = RPCRDMA_MIN_INLINE; static unsigned int max_inline_size = RPCRDMA_MAX_INLINE; -static unsigned int zero; static unsigned int max_padding = PAGE_SIZE; static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS; static unsigned int max_memreg = RPCRDMA_LAST - 1; @@ -122,7 +121,7 @@ static struct ctl_table xr_tunables_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &max_padding, }, { diff --git a/net/tipc/sysctl.c b/net/tipc/sysctl.c index 9df82a573aa7..6159d327db76 100644 --- a/net/tipc/sysctl.c +++ b/net/tipc/sysctl.c @@ -38,8 +38,6 @@ #include -static int zero; -static int one = 1; static struct ctl_table_header *tipc_ctl_hdr; static struct ctl_table tipc_table[] = { @@ -49,7 +47,7 @@ static struct ctl_table tipc_table[] = { .maxlen = sizeof(sysctl_tipc_rmem), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = SYSCTL_ONE, }, { .procname = "named_timeout", @@ -57,7 +55,7 @@ static struct ctl_table tipc_table[] = { .maxlen = sizeof(sysctl_tipc_named_timeout), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "sk_filter", diff --git a/security/keys/sysctl.c b/security/keys/sysctl.c index dd1e21fab827..b46b651b3c4c 100644 --- a/security/keys/sysctl.c +++ b/security/keys/sysctl.c @@ -9,8 +9,6 @@ #include #include "internal.h" -static const int zero, one = 1, max = INT_MAX; - struct ctl_table key_sysctls[] = { { .procname = "maxkeys", @@ -18,8 +16,8 @@ struct ctl_table key_sysctls[] = { .maxlen = sizeof(unsigned), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *) &one, - .extra2 = (void *) &max, + .extra1 = (void *) SYSCTL_ONE, + .extra2 = (void *) SYSCTL_INT_MAX, }, { .procname = "maxbytes", @@ -27,8 +25,8 @@ struct ctl_table key_sysctls[] = { .maxlen = sizeof(unsigned), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *) &one, - .extra2 = (void *) &max, + .extra1 = (void *) SYSCTL_ONE, + .extra2 = (void *) SYSCTL_INT_MAX, }, { .procname = "root_maxkeys", @@ -36,8 +34,8 @@ struct ctl_table key_sysctls[] = { .maxlen = sizeof(unsigned), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *) &one, - .extra2 = (void *) &max, + .extra1 = (void *) SYSCTL_ONE, + .extra2 = (void *) SYSCTL_INT_MAX, }, { .procname = "root_maxbytes", @@ -45,8 +43,8 @@ struct ctl_table key_sysctls[] = { .maxlen = sizeof(unsigned), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *) &one, - .extra2 = (void *) &max, + .extra1 = (void *) SYSCTL_ONE, + .extra2 = (void *) SYSCTL_INT_MAX, }, { .procname = "gc_delay", @@ -54,8 +52,8 @@ struct ctl_table key_sysctls[] = { .maxlen = sizeof(unsigned), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *) &zero, - .extra2 = (void *) &max, + .extra1 = (void *) SYSCTL_ZERO, + .extra2 = (void *) SYSCTL_INT_MAX, }, #ifdef CONFIG_PERSISTENT_KEYRINGS { @@ -64,8 +62,8 @@ struct ctl_table key_sysctls[] = { .maxlen = sizeof(unsigned), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *) &zero, - .extra2 = (void *) &max, + .extra1 = (void *) SYSCTL_ZERO, + .extra2 = (void *) SYSCTL_INT_MAX, }, #endif { } diff --git a/security/loadpin/loadpin.c b/security/loadpin/loadpin.c index 81519c804888..ee5cb944f4ad 100644 --- a/security/loadpin/loadpin.c +++ b/security/loadpin/loadpin.c @@ -43,8 +43,6 @@ static struct super_block *pinned_root; static DEFINE_SPINLOCK(pinned_root_spinlock); #ifdef CONFIG_SYSCTL -static int zero; -static int one = 1; static struct ctl_path loadpin_sysctl_path[] = { { .procname = "kernel", }, @@ -59,8 +57,8 @@ static struct ctl_table loadpin_sysctl_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { } }; diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c index 01c6239c4493..94dc346370b1 100644 --- a/security/yama/yama_lsm.c +++ b/security/yama/yama_lsm.c @@ -445,7 +445,6 @@ static int yama_dointvec_minmax(struct ctl_table *table, int write, return proc_dointvec_minmax(&table_copy, write, buffer, lenp, ppos); } -static int zero; static int max_scope = YAMA_SCOPE_NO_ATTACH; static struct ctl_path yama_sysctl_path[] = { @@ -461,7 +460,7 @@ static struct ctl_table yama_sysctl_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = yama_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &max_scope, }, { } -- cgit v1.2.3-55-g7522 From e320ab3cec7dd8b1606964d81ae1e14391ff8e96 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Fri, 19 Jul 2019 03:22:35 +0000 Subject: x86/hyper-v: Zero out the VP ASSIST PAGE on allocation The VP ASSIST PAGE is an "overlay" page (see Hyper-V TLFS's Section 5.2.1 "GPA Overlay Pages" for the details) and here is an excerpt: "The hypervisor defines several special pages that "overlay" the guest's Guest Physical Addresses (GPA) space. Overlays are addressed GPA but are not included in the normal GPA map maintained internally by the hypervisor. Conceptually, they exist in a separate map that overlays the GPA map. If a page within the GPA space is overlaid, any SPA page mapped to the GPA page is effectively "obscured" and generally unreachable by the virtual processor through processor memory accesses. If an overlay page is disabled, the underlying GPA page is "uncovered", and an existing mapping becomes accessible to the guest." SPA = System Physical Address = the final real physical address. When a CPU (e.g. CPU1) is onlined, hv_cpu_init() allocates the VP ASSIST PAGE and enables the EOI optimization for this CPU by writing the MSR HV_X64_MSR_VP_ASSIST_PAGE. From now on, hvp->apic_assist belongs to the special SPA page, and this CPU *always* uses hvp->apic_assist (which is shared with the hypervisor) to decide if it needs to write the EOI MSR. When a CPU is offlined then on the outgoing CPU: 1. hv_cpu_die() disables the EOI optimizaton for this CPU, and from now on hvp->apic_assist belongs to the original "normal" SPA page; 2. the remaining work of stopping this CPU is done 3. this CPU is completely stopped. Between 1 and 3, this CPU can still receive interrupts (e.g. reschedule IPIs from CPU0, and Local APIC timer interrupts), and this CPU *must* write the EOI MSR for every interrupt received, otherwise the hypervisor may not deliver further interrupts, which may be needed to completely stop the CPU. So, after the EOI optimization is disabled in hv_cpu_die(), it's required that the hvp->apic_assist's bit0 is zero, which is not guaranteed by the current allocation mode because it lacks __GFP_ZERO. As a consequence the bit might be set and interrupt handling would not write the EOI MSR causing interrupt delivery to become stuck. Add the missing __GFP_ZERO to the allocation. Note 1: after the "normal" SPA page is allocted and zeroed out, neither the hypervisor nor the guest writes into the page, so the page remains with zeros. Note 2: see Section 10.3.5 "EOI Assist" for the details of the EOI optimization. When the optimization is enabled, the guest can still write the EOI MSR register irrespective of the "No EOI required" value, but that's slower than the optimized assist based variant. Fixes: ba696429d290 ("x86/hyper-v: Implement EOI assist") Signed-off-by: Dexuan Cui Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/ Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1907201020540.1782@nanos.tec.linutronix.de --- arch/x86/entry/entry_64.S | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 7cb2e1f1ec09..f7c70c1bee8b 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -875,7 +875,12 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt UNWIND_HINT_REGS .if \read_cr2 - GET_CR2_INTO(%rdx); /* can clobber %rax */ + /* + * Store CR2 early so subsequent faults cannot clobber it. Use R12 as + * intermediate storage as RDX can be clobbered in enter_from_user_mode(). + * GET_CR2_INTO can clobber RAX. + */ + GET_CR2_INTO(%r12); .endif .if \shift_ist != -1 @@ -904,6 +909,10 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt subq $\ist_offset, CPU_TSS_IST(\shift_ist) .endif + .if \read_cr2 + movq %r12, %rdx /* Move CR2 into 3rd argument */ + .endif + call \do_sym .if \shift_ist != -1 -- cgit v1.2.3-55-g7522