From a2f4870697a5bcf4a87073ec6b32dd2928c1211d Mon Sep 17 00:00:00 2001
From: Theodore Ts'o
Date: Tue, 17 Mar 2015 12:23:19 -0400
Subject: fs: make sure the timestamps for lazytime inodes eventually get
 written

Jan Kara pointed out that if there is an inode which is constantly
getting dirtied with I_DIRTY_PAGES, an inode with an updated timestamp
will never be written since inode->dirtied_when is constantly getting
updated.  We fix this by adding an extra field to the inode,
dirtied_time_when, so inodes with a stale dirtytime can get detected
and handled.

In addition, if we have a dirtytime inode caused by an atime update,
and there is no write activity on the file system, we need to have a
secondary system to make sure these inodes get written out.  We do
this by setting up a second delayed work structure which wakes up the
CPU much more rarely compared to writeback_expire_centisecs.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 include/linux/fs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index b4d71b5e1ff2..f4131e8ead74 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -604,6 +604,7 @@ struct inode {
 	struct mutex		i_mutex;
 
 	unsigned long		dirtied_when;	/* jiffies of first dirtying */
+	unsigned long		dirtied_time_when;
 
 	struct hlist_node	i_hash;
 	struct list_head	i_wb_list;	/* backing dev IO list */
-- 
cgit v1.2.3-55-g7522


From 1efff914afac8a965ad63817ecf8861a927c2ace Mon Sep 17 00:00:00 2001
From: Theodore Ts'o
Date: Tue, 17 Mar 2015 12:23:32 -0400
Subject: fs: add dirtytime_expire_seconds sysctl

Add a tuning knob so we can adjust the dirtytime expiration timeout,
which is very useful for testing lazytime.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/fs-writeback.c         | 11 +++++++++++
 include/linux/writeback.h |  3 +++
 kernel/sysctl.c           |  8 ++++++++
 3 files changed, 22 insertions(+)

(limited to 'include')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 2cfcd74faf87..32a8bbd7a9ad 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1188,6 +1188,17 @@ static int __init start_dirtytime_writeback(void)
 }
 __initcall(start_dirtytime_writeback);
 
+int dirtytime_interval_handler(struct ctl_table *table, int write,
+			       void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (ret == 0 && write)
+		mod_delayed_work(system_wq, &dirtytime_work, 0);
+	return ret;
+}
+
 static noinline void block_dump___mark_inode_dirty(struct inode *inode)
 {
 	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 00048339c23e..b2dd371ec0ca 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -130,6 +130,7 @@ extern int vm_dirty_ratio;
 extern unsigned long vm_dirty_bytes;
 extern unsigned int dirty_writeback_interval;
 extern unsigned int dirty_expire_interval;
+extern unsigned int dirtytime_expire_interval;
 extern int vm_highmem_is_dirtyable;
 extern int block_dump;
 extern int laptop_mode;
@@ -146,6 +147,8 @@ extern int dirty_ratio_handler(struct ctl_table *table, int write,
 extern int dirty_bytes_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
+int dirtytime_interval_handler(struct ctl_table *table, int write,
+			       void __user *buffer, size_t *lenp, loff_t *ppos);
 
 struct ctl_table;
 int dirty_writeback_centisecs_handler(struct ctl_table *, int,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 88ea2d6e0031..ce410bb9f2e1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1227,6 +1227,14 @@ static struct ctl_table vm_table[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &zero,
 	},
+	{
+		.procname	= "dirtytime_expire_seconds",
+		.data		= &dirtytime_expire_interval,
+		.maxlen		= sizeof(dirty_expire_interval),
+		.mode		= 0644,
+		.proc_handler	= dirtytime_interval_handler,
+		.extra1		= &zero,
+	},
 	{
 		.procname       = "nr_pdflush_threads",
 		.mode           = 0444 /* read-only */,
-- 
cgit v1.2.3-55-g7522


From a736775db683174269c65c7c5cc8e5ee534e7681 Mon Sep 17 00:00:00 2001
From: Charlie Mooney
Date: Fri, 20 Mar 2015 09:40:17 -0700
Subject: Input: add MT_TOOL_PALM

Currently there are only two "tools" that can be specified by a multi-touch
driver: MT_TOOL_FINGER and MT_TOOL_PEN. In working with Elan (The touch
vendor) and discussing their next-gen devices it seems that it will be
useful to have more tools so that their devices can give the upper layers
of the stack hints as to what is touching the sensor.

In particular they have new experimental firmware that can better
differentiate between palms vs fingertips and would like to plumb a patch
so that we can use their hints in higher-level gesture soft- ware.  The
firmware on the device can reasonably do a better job of palm detection
because it has access to all of the raw sensor readings as opposed to just
the width/pressure/etc that are exposed by the driver.  As such, the
firmware can characterize what a palm looks like in much finer-grained
detail and this change would allow such a device to share its findings with
the kernel.

Signed-off-by: Charlie Mooney <charliemooney@chromium.org>
Acked-by: Peter Hutterer <peter.hutterer@who-t.net>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 Documentation/input/multi-touch-protocol.txt | 9 ++++++---
 include/uapi/linux/input.h                   | 3 ++-
 2 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/Documentation/input/multi-touch-protocol.txt b/Documentation/input/multi-touch-protocol.txt
index 7b4f59c09ee2..b85d000faeb4 100644
--- a/Documentation/input/multi-touch-protocol.txt
+++ b/Documentation/input/multi-touch-protocol.txt
@@ -312,9 +312,12 @@ ABS_MT_TOOL_TYPE
 
 The type of approaching tool. A lot of kernel drivers cannot distinguish
 between different tool types, such as a finger or a pen. In such cases, the
-event should be omitted. The protocol currently supports MT_TOOL_FINGER and
-MT_TOOL_PEN [2]. For type B devices, this event is handled by input core;
-drivers should instead use input_mt_report_slot_state().
+event should be omitted. The protocol currently supports MT_TOOL_FINGER,
+MT_TOOL_PEN, and MT_TOOL_PALM [2]. For type B devices, this event is handled
+by input core; drivers should instead use input_mt_report_slot_state().
+A contact's ABS_MT_TOOL_TYPE may change over time while still touching the
+device, because the firmware may not be able to determine which tool is being
+used when it first appears.
 
 ABS_MT_BLOB_ID
 
diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h
index a1d7e931ab72..2320b0ce7579 100644
--- a/include/uapi/linux/input.h
+++ b/include/uapi/linux/input.h
@@ -972,7 +972,8 @@ struct input_keymap_entry {
  */
 #define MT_TOOL_FINGER		0
 #define MT_TOOL_PEN		1
-#define MT_TOOL_MAX		1
+#define MT_TOOL_PALM		2
+#define MT_TOOL_MAX		2
 
 /*
  * Values describing the status of a force-feedback effect
-- 
cgit v1.2.3-55-g7522


From 1e9e39f4a29857a396ac7b669d109f697f66695e Mon Sep 17 00:00:00 2001
From: Ben Hutchings
Date: Thu, 26 Feb 2015 19:34:37 +0000
Subject: usbnet: Fix tx_packets stat for FLAG_MULTI_FRAME drivers

Currently the usbnet core does not update the tx_packets statistic for
drivers with FLAG_MULTI_PACKET and there is no hook in the TX
completion path where they could do this.

cdc_ncm and dependent drivers are bumping tx_packets stat on the
transmit path while asix and sr9800 aren't updating it at all.

Add a packet count in struct skb_data so these drivers can fill it
in, initialise it to 1 for other drivers, and add the packet count
to the tx_packets statistic on completion.

Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
Tested-by: Bjørn Mork <bjorn@mork.no>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/asix_common.c |  2 ++
 drivers/net/usb/cdc_ncm.c     |  3 ++-
 drivers/net/usb/sr9800.c      |  1 +
 drivers/net/usb/usbnet.c      |  5 +++--
 include/linux/usb/usbnet.h    | 12 ++++++++++++
 5 files changed, 20 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/net/usb/asix_common.c b/drivers/net/usb/asix_common.c
index 5c55f11572ba..724a9b50df7a 100644
--- a/drivers/net/usb/asix_common.c
+++ b/drivers/net/usb/asix_common.c
@@ -188,6 +188,8 @@ struct sk_buff *asix_tx_fixup(struct usbnet *dev, struct sk_buff *skb,
 		memcpy(skb_tail_pointer(skb), &padbytes, sizeof(padbytes));
 		skb_put(skb, sizeof(padbytes));
 	}
+
+	usbnet_set_skb_tx_stats(skb, 1);
 	return skb;
 }
 
diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c
index 80a844e0ae03..70cbea551139 100644
--- a/drivers/net/usb/cdc_ncm.c
+++ b/drivers/net/usb/cdc_ncm.c
@@ -1172,7 +1172,6 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign)
 
 	/* return skb */
 	ctx->tx_curr_skb = NULL;
-	dev->net->stats.tx_packets += ctx->tx_curr_frame_num;
 
 	/* keep private stats: framing overhead and number of NTBs */
 	ctx->tx_overhead += skb_out->len - ctx->tx_curr_frame_payload;
@@ -1184,6 +1183,8 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign)
 	 */
 	dev->net->stats.tx_bytes -= skb_out->len - ctx->tx_curr_frame_payload;
 
+	usbnet_set_skb_tx_stats(skb_out, n);
+
 	return skb_out;
 
 exit_no_skb:
diff --git a/drivers/net/usb/sr9800.c b/drivers/net/usb/sr9800.c
index b94a0fbb8b3b..7650cdc8fe6b 100644
--- a/drivers/net/usb/sr9800.c
+++ b/drivers/net/usb/sr9800.c
@@ -144,6 +144,7 @@ static struct sk_buff *sr_tx_fixup(struct usbnet *dev, struct sk_buff *skb,
 		skb_put(skb, sizeof(padbytes));
 	}
 
+	usbnet_set_skb_tx_stats(skb, 1);
 	return skb;
 }
 
diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index 449835f4331e..0f3ff285f6a1 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -1188,8 +1188,7 @@ static void tx_complete (struct urb *urb)
 	struct usbnet		*dev = entry->dev;
 
 	if (urb->status == 0) {
-		if (!(dev->driver_info->flags & FLAG_MULTI_PACKET))
-			dev->net->stats.tx_packets++;
+		dev->net->stats.tx_packets += entry->packets;
 		dev->net->stats.tx_bytes += entry->length;
 	} else {
 		dev->net->stats.tx_errors++;
@@ -1348,6 +1347,8 @@ netdev_tx_t usbnet_start_xmit (struct sk_buff *skb,
 			urb->transfer_flags |= URB_ZERO_PACKET;
 	}
 	entry->length = urb->transfer_buffer_length = length;
+	if (!(info->flags & FLAG_MULTI_PACKET))
+		usbnet_set_skb_tx_stats(skb, 1);
 
 	spin_lock_irqsave(&dev->txq.lock, flags);
 	retval = usb_autopm_get_interface_async(dev->intf);
diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h
index d9a4905e01d0..ff3fb2bd0e90 100644
--- a/include/linux/usb/usbnet.h
+++ b/include/linux/usb/usbnet.h
@@ -228,8 +228,20 @@ struct skb_data {	/* skb->cb is one of these */
 	struct usbnet		*dev;
 	enum skb_state		state;
 	size_t			length;
+	unsigned long		packets;
 };
 
+/* Drivers that set FLAG_MULTI_PACKET must call this in their
+ * tx_fixup method before returning an skb.
+ */
+static inline void
+usbnet_set_skb_tx_stats(struct sk_buff *skb, unsigned long packets)
+{
+	struct skb_data *entry = (struct skb_data *) skb->cb;
+
+	entry->packets = packets;
+}
+
 extern int usbnet_open(struct net_device *net);
 extern int usbnet_stop(struct net_device *net);
 extern netdev_tx_t usbnet_start_xmit(struct sk_buff *skb,
-- 
cgit v1.2.3-55-g7522


From 7a1e890e2168e33fb62d84528e996b8b4b478fea Mon Sep 17 00:00:00 2001
From: Ben Hutchings
Date: Wed, 25 Mar 2015 21:41:33 +0100
Subject: usbnet: Fix tx_bytes statistic running backward in cdc_ncm

cdc_ncm disagrees with usbnet about how much framing overhead should
be counted in the tx_bytes statistics, and tries 'fix' this by
decrementing tx_bytes on the transmit path.  But statistics must never
be decremented except due to roll-over; this will thoroughly confuse
user-space.  Also, tx_bytes is only incremented by usbnet in the
completion path.

Fix this by requiring drivers that set FLAG_MULTI_FRAME to set a
tx_bytes delta along with the tx_packets count.

Fixes: beeecd42c3b4 ("net: cdc_ncm/cdc_mbim: adding NCM protocol statistics")
Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
Signed-off-by: Bjørn Mork <bjorn@mork.no>
---
 drivers/net/usb/asix_common.c |  2 +-
 drivers/net/usb/cdc_ncm.c     |  7 +++----
 drivers/net/usb/sr9800.c      |  2 +-
 drivers/net/usb/usbnet.c      | 16 +++++++++++++---
 include/linux/usb/usbnet.h    |  6 ++++--
 5 files changed, 22 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/net/usb/asix_common.c b/drivers/net/usb/asix_common.c
index 724a9b50df7a..75d6f26729a3 100644
--- a/drivers/net/usb/asix_common.c
+++ b/drivers/net/usb/asix_common.c
@@ -189,7 +189,7 @@ struct sk_buff *asix_tx_fixup(struct usbnet *dev, struct sk_buff *skb,
 		skb_put(skb, sizeof(padbytes));
 	}
 
-	usbnet_set_skb_tx_stats(skb, 1);
+	usbnet_set_skb_tx_stats(skb, 1, 0);
 	return skb;
 }
 
diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c
index 70cbea551139..c3e4da9e79ca 100644
--- a/drivers/net/usb/cdc_ncm.c
+++ b/drivers/net/usb/cdc_ncm.c
@@ -1177,13 +1177,12 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign)
 	ctx->tx_overhead += skb_out->len - ctx->tx_curr_frame_payload;
 	ctx->tx_ntbs++;
 
-	/* usbnet has already counted all the framing overhead.
+	/* usbnet will count all the framing overhead by default.
 	 * Adjust the stats so that the tx_bytes counter show real
 	 * payload data instead.
 	 */
-	dev->net->stats.tx_bytes -= skb_out->len - ctx->tx_curr_frame_payload;
-
-	usbnet_set_skb_tx_stats(skb_out, n);
+	usbnet_set_skb_tx_stats(skb_out, n,
+				ctx->tx_curr_frame_payload - skb_out->len);
 
 	return skb_out;
 
diff --git a/drivers/net/usb/sr9800.c b/drivers/net/usb/sr9800.c
index 7650cdc8fe6b..953de13267df 100644
--- a/drivers/net/usb/sr9800.c
+++ b/drivers/net/usb/sr9800.c
@@ -144,7 +144,7 @@ static struct sk_buff *sr_tx_fixup(struct usbnet *dev, struct sk_buff *skb,
 		skb_put(skb, sizeof(padbytes));
 	}
 
-	usbnet_set_skb_tx_stats(skb, 1);
+	usbnet_set_skb_tx_stats(skb, 1, 0);
 	return skb;
 }
 
diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index 0f3ff285f6a1..777757ae1973 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -1346,9 +1346,19 @@ netdev_tx_t usbnet_start_xmit (struct sk_buff *skb,
 		} else
 			urb->transfer_flags |= URB_ZERO_PACKET;
 	}
-	entry->length = urb->transfer_buffer_length = length;
-	if (!(info->flags & FLAG_MULTI_PACKET))
-		usbnet_set_skb_tx_stats(skb, 1);
+	urb->transfer_buffer_length = length;
+
+	if (info->flags & FLAG_MULTI_PACKET) {
+		/* Driver has set number of packets and a length delta.
+		 * Calculate the complete length and ensure that it's
+		 * positive.
+		 */
+		entry->length += length;
+		if (WARN_ON_ONCE(entry->length <= 0))
+			entry->length = length;
+	} else {
+		usbnet_set_skb_tx_stats(skb, 1, length);
+	}
 
 	spin_lock_irqsave(&dev->txq.lock, flags);
 	retval = usb_autopm_get_interface_async(dev->intf);
diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h
index ff3fb2bd0e90..6e0ce8c7b8cb 100644
--- a/include/linux/usb/usbnet.h
+++ b/include/linux/usb/usbnet.h
@@ -227,7 +227,7 @@ struct skb_data {	/* skb->cb is one of these */
 	struct urb		*urb;
 	struct usbnet		*dev;
 	enum skb_state		state;
-	size_t			length;
+	long			length;
 	unsigned long		packets;
 };
 
@@ -235,11 +235,13 @@ struct skb_data {	/* skb->cb is one of these */
  * tx_fixup method before returning an skb.
  */
 static inline void
-usbnet_set_skb_tx_stats(struct sk_buff *skb, unsigned long packets)
+usbnet_set_skb_tx_stats(struct sk_buff *skb,
+			unsigned long packets, long bytes_delta)
 {
 	struct skb_data *entry = (struct skb_data *) skb->cb;
 
 	entry->packets = packets;
+	entry->length = bytes_delta;
 }
 
 extern int usbnet_open(struct net_device *net);
-- 
cgit v1.2.3-55-g7522


From 4ad3e3634a6cbe916722c7113c5b488d52c7a3dc Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Fri, 27 Mar 2015 14:15:04 +0000
Subject: irqchip: gicv3-its: Fix PROP/PEND and BASE/CBASE confusion

The ITS driver sometime mixes up the use of GICR_PROPBASE bitfields
for the GICR_PENDBASE register, and GITS_BASER for GICR_CBASE.

This does not lead to any observable bug because similar bits are
at the same location, but this just make the code even harder to
understand...

This patch provides the required #defines and fixes the mixup.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Link: https://lkml.kernel.org/r/1427465705-17126-4-git-send-email-marc.zyngier@arm.com
Signed-off-by: Jason Cooper <jason@lakedaemon.net>
---
 drivers/irqchip/irq-gic-v3-its.c   |  6 +++---
 include/linux/irqchip/arm-gic-v3.h | 13 +++++++++++++
 2 files changed, 16 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index fa0c43660c8b..56353f6b5952 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -986,8 +986,8 @@ static void its_cpu_init_lpis(void)
 
 	/* set PENDBASE */
 	val = (page_to_phys(pend_page) |
-	       GICR_PROPBASER_InnerShareable |
-	       GICR_PROPBASER_WaWb);
+	       GICR_PENDBASER_InnerShareable |
+	       GICR_PENDBASER_WaWb);
 
 	writeq_relaxed(val, rbase + GICR_PENDBASER);
 
@@ -1425,7 +1425,7 @@ static int its_probe(struct device_node *node, struct irq_domain *parent)
 	writeq_relaxed(0, its->base + GITS_CWRITER);
 	writel_relaxed(GITS_CTLR_ENABLE, its->base + GITS_CTLR);
 
-	if ((tmp ^ baser) & GITS_BASER_SHAREABILITY_MASK) {
+	if ((tmp ^ baser) & GITS_CBASER_SHAREABILITY_MASK) {
 		pr_info("ITS: using cache flushing for cmd queue\n");
 		its->flags |= ITS_FLAGS_CMDQ_NEEDS_FLUSHING;
 	}
diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index 781974afff9f..826a4bd63d4a 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -128,6 +128,19 @@
 #define GICR_PROPBASER_RaWaWb		(7U << 7)
 #define GICR_PROPBASER_IDBITS_MASK	(0x1f)
 
+#define GICR_PENDBASER_NonShareable	(0U << 10)
+#define GICR_PENDBASER_InnerShareable	(1U << 10)
+#define GICR_PENDBASER_OuterShareable	(2U << 10)
+#define GICR_PENDBASER_SHAREABILITY_MASK (3UL << 10)
+#define GICR_PENDBASER_nCnB		(0U << 7)
+#define GICR_PENDBASER_nC		(1U << 7)
+#define GICR_PENDBASER_RaWt		(2U << 7)
+#define GICR_PENDBASER_RaWb		(3U << 7)
+#define GICR_PENDBASER_WaWt		(4U << 7)
+#define GICR_PENDBASER_WaWb		(5U << 7)
+#define GICR_PENDBASER_RaWaWt		(6U << 7)
+#define GICR_PENDBASER_RaWaWb		(7U << 7)
+
 /*
  * Re-Distributor registers, offsets from SGI_base
  */
-- 
cgit v1.2.3-55-g7522


From 241a386c7dbb8b0db400a1f92f2ebe3b10eb661d Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Fri, 27 Mar 2015 14:15:05 +0000
Subject: irqchip: gicv3-its: Use non-cacheable accesses when no shareability

If the ITS or the redistributors report their shareability as zero,
then it is important to make sure they will no generate any cacheable
traffic, as this is unlikely to produce the expected result.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Link: https://lkml.kernel.org/r/1427465705-17126-5-git-send-email-marc.zyngier@arm.com
Signed-off-by: Jason Cooper <jason@lakedaemon.net>
---
 drivers/irqchip/irq-gic-v3-its.c   | 47 ++++++++++++++++++++++++++++++++++----
 include/linux/irqchip/arm-gic-v3.h |  4 ++++
 2 files changed, 47 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 56353f6b5952..9687f8afebff 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -802,6 +802,7 @@ static int its_alloc_tables(struct its_node *its)
 	int i;
 	int psz = SZ_64K;
 	u64 shr = GITS_BASER_InnerShareable;
+	u64 cache = GITS_BASER_WaWb;
 
 	for (i = 0; i < GITS_BASER_NR_REGS; i++) {
 		u64 val = readq_relaxed(its->base + GITS_BASER + i * 8);
@@ -848,7 +849,7 @@ retry_baser:
 		val = (virt_to_phys(base) 				 |
 		       (type << GITS_BASER_TYPE_SHIFT)			 |
 		       ((entry_size - 1) << GITS_BASER_ENTRY_SIZE_SHIFT) |
-		       GITS_BASER_WaWb					 |
+		       cache						 |
 		       shr						 |
 		       GITS_BASER_VALID);
 
@@ -874,9 +875,12 @@ retry_baser:
 			 * Shareability didn't stick. Just use
 			 * whatever the read reported, which is likely
 			 * to be the only thing this redistributor
-			 * supports.
+			 * supports. If that's zero, make it
+			 * non-cacheable as well.
 			 */
 			shr = tmp & GITS_BASER_SHAREABILITY_MASK;
+			if (!shr)
+				cache = GITS_BASER_nC;
 			goto retry_baser;
 		}
 
@@ -980,6 +984,17 @@ static void its_cpu_init_lpis(void)
 	tmp = readq_relaxed(rbase + GICR_PROPBASER);
 
 	if ((tmp ^ val) & GICR_PROPBASER_SHAREABILITY_MASK) {
+		if (!(tmp & GICR_PROPBASER_SHAREABILITY_MASK)) {
+			/*
+			 * The HW reports non-shareable, we must
+			 * remove the cacheability attributes as
+			 * well.
+			 */
+			val &= ~(GICR_PROPBASER_SHAREABILITY_MASK |
+				 GICR_PROPBASER_CACHEABILITY_MASK);
+			val |= GICR_PROPBASER_nC;
+			writeq_relaxed(val, rbase + GICR_PROPBASER);
+		}
 		pr_info_once("GIC: using cache flushing for LPI property table\n");
 		gic_rdists->flags |= RDIST_FLAGS_PROPBASE_NEEDS_FLUSHING;
 	}
@@ -990,6 +1005,18 @@ static void its_cpu_init_lpis(void)
 	       GICR_PENDBASER_WaWb);
 
 	writeq_relaxed(val, rbase + GICR_PENDBASER);
+	tmp = readq_relaxed(rbase + GICR_PENDBASER);
+
+	if (!(tmp & GICR_PENDBASER_SHAREABILITY_MASK)) {
+		/*
+		 * The HW reports non-shareable, we must remove the
+		 * cacheability attributes as well.
+		 */
+		val &= ~(GICR_PENDBASER_SHAREABILITY_MASK |
+			 GICR_PENDBASER_CACHEABILITY_MASK);
+		val |= GICR_PENDBASER_nC;
+		writeq_relaxed(val, rbase + GICR_PENDBASER);
+	}
 
 	/* Enable LPIs */
 	val = readl_relaxed(rbase + GICR_CTLR);
@@ -1422,14 +1449,26 @@ static int its_probe(struct device_node *node, struct irq_domain *parent)
 
 	writeq_relaxed(baser, its->base + GITS_CBASER);
 	tmp = readq_relaxed(its->base + GITS_CBASER);
-	writeq_relaxed(0, its->base + GITS_CWRITER);
-	writel_relaxed(GITS_CTLR_ENABLE, its->base + GITS_CTLR);
 
 	if ((tmp ^ baser) & GITS_CBASER_SHAREABILITY_MASK) {
+		if (!(tmp & GITS_CBASER_SHAREABILITY_MASK)) {
+			/*
+			 * The HW reports non-shareable, we must
+			 * remove the cacheability attributes as
+			 * well.
+			 */
+			baser &= ~(GITS_CBASER_SHAREABILITY_MASK |
+				   GITS_CBASER_CACHEABILITY_MASK);
+			baser |= GITS_CBASER_nC;
+			writeq_relaxed(baser, its->base + GITS_CBASER);
+		}
 		pr_info("ITS: using cache flushing for cmd queue\n");
 		its->flags |= ITS_FLAGS_CMDQ_NEEDS_FLUSHING;
 	}
 
+	writeq_relaxed(0, its->base + GITS_CWRITER);
+	writel_relaxed(GITS_CTLR_ENABLE, its->base + GITS_CTLR);
+
 	if (of_property_read_bool(its->msi_chip.of_node, "msi-controller")) {
 		its->domain = irq_domain_add_tree(NULL, &its_domain_ops, its);
 		if (!its->domain) {
diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index 826a4bd63d4a..ffbc034c8810 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -126,6 +126,7 @@
 #define GICR_PROPBASER_WaWb		(5U << 7)
 #define GICR_PROPBASER_RaWaWt		(6U << 7)
 #define GICR_PROPBASER_RaWaWb		(7U << 7)
+#define GICR_PROPBASER_CACHEABILITY_MASK (7U << 7)
 #define GICR_PROPBASER_IDBITS_MASK	(0x1f)
 
 #define GICR_PENDBASER_NonShareable	(0U << 10)
@@ -140,6 +141,7 @@
 #define GICR_PENDBASER_WaWb		(5U << 7)
 #define GICR_PENDBASER_RaWaWt		(6U << 7)
 #define GICR_PENDBASER_RaWaWb		(7U << 7)
+#define GICR_PENDBASER_CACHEABILITY_MASK (7U << 7)
 
 /*
  * Re-Distributor registers, offsets from SGI_base
@@ -195,6 +197,7 @@
 #define GITS_CBASER_WaWb		(5UL << 59)
 #define GITS_CBASER_RaWaWt		(6UL << 59)
 #define GITS_CBASER_RaWaWb		(7UL << 59)
+#define GITS_CBASER_CACHEABILITY_MASK	(7UL << 59)
 #define GITS_CBASER_NonShareable	(0UL << 10)
 #define GITS_CBASER_InnerShareable	(1UL << 10)
 #define GITS_CBASER_OuterShareable	(2UL << 10)
@@ -211,6 +214,7 @@
 #define GITS_BASER_WaWb			(5UL << 59)
 #define GITS_BASER_RaWaWt		(6UL << 59)
 #define GITS_BASER_RaWaWb		(7UL << 59)
+#define GITS_BASER_CACHEABILITY_MASK	(7UL << 59)
 #define GITS_BASER_TYPE_SHIFT		(56)
 #define GITS_BASER_TYPE(r)		(((r) >> GITS_BASER_TYPE_SHIFT) & 7)
 #define GITS_BASER_ENTRY_SIZE_SHIFT	(48)
-- 
cgit v1.2.3-55-g7522


From f3f03330dee0526d82f2a0fd1a79d207ed1ac439 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Mon, 30 Mar 2015 18:46:29 +0200
Subject: nfsd: require an explicit option to enable pNFS

Turns out sending out layouts to any client is a bad idea if they
can't get at the storage device, so require explicit admin action
to enable pNFS.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4layouts.c            | 2 +-
 include/uapi/linux/nfsd/export.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 80e236bf79fc..6904213a4363 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -118,7 +118,7 @@ void nfsd4_setup_layout_type(struct svc_export *exp)
 {
 	struct super_block *sb = exp->ex_path.mnt->mnt_sb;
 
-	if (exp->ex_flags & NFSEXP_NOPNFS)
+	if (!(exp->ex_flags & NFSEXP_PNFS))
 		return;
 
 	if (sb->s_export_op->get_uuid &&
diff --git a/include/uapi/linux/nfsd/export.h b/include/uapi/linux/nfsd/export.h
index 4742f2cb42f2..d3bd6ffec041 100644
--- a/include/uapi/linux/nfsd/export.h
+++ b/include/uapi/linux/nfsd/export.h
@@ -47,7 +47,7 @@
  * exported filesystem.
  */
 #define	NFSEXP_V4ROOT		0x10000
-#define NFSEXP_NOPNFS		0x20000
+#define NFSEXP_PNFS		0x20000
 
 /* All flags that we claim to support.  (Note we don't support NOACL.) */
 #define NFSEXP_ALLFLAGS		0x3FE7F
-- 
cgit v1.2.3-55-g7522


From e9637415a92cf25ad800b7fdeddcd30cce7b44ab Mon Sep 17 00:00:00 2001
From: Mike Snitzer
Date: Mon, 30 Mar 2015 13:39:09 -0400
Subject: block: fix blk_stack_limits() regression due to lcm() change

Linux 3.19 commit 69c953c ("lib/lcm.c: lcm(n,0)=lcm(0,n) is 0, not n")
caused blk_stack_limits() to not properly stack queue_limits for stacked
devices (e.g. DM).

Fix this regression by establishing lcm_not_zero() and switching
blk_stack_limits() over to using it.

DM uses blk_set_stacking_limits() to establish the initial top-level
queue_limits that are then built up based on underlying devices' limits
using blk_stack_limits().  In the case of optimal_io_size (io_opt)
blk_set_stacking_limits() establishes a default value of 0.  With commit
69c953c, lcm(0, n) is no longer n, which compromises proper stacking of
the underlying devices' io_opt.

Test:
$ modprobe scsi_debug dev_size_mb=10 num_tgts=1 opt_blks=1536
$ cat /sys/block/sde/queue/optimal_io_size
786432
$ dmsetup create node --table "0 100 linear /dev/sde 0"

Before this fix:
$ cat /sys/block/dm-5/queue/optimal_io_size
0

After this fix:
$ cat /sys/block/dm-5/queue/optimal_io_size
786432

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org # 3.19+
Acked-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-settings.c |  6 +++---
 include/linux/lcm.h  |  1 +
 lib/lcm.c            | 11 +++++++++++
 3 files changed, 15 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 6ed2cbe5e8c9..12600bfffca9 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -585,7 +585,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 				     b->physical_block_size);
 
 	t->io_min = max(t->io_min, b->io_min);
-	t->io_opt = lcm(t->io_opt, b->io_opt);
+	t->io_opt = lcm_not_zero(t->io_opt, b->io_opt);
 
 	t->cluster &= b->cluster;
 	t->discard_zeroes_data &= b->discard_zeroes_data;
@@ -616,7 +616,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 		    b->raid_partial_stripes_expensive);
 
 	/* Find lowest common alignment_offset */
-	t->alignment_offset = lcm(t->alignment_offset, alignment)
+	t->alignment_offset = lcm_not_zero(t->alignment_offset, alignment)
 		% max(t->physical_block_size, t->io_min);
 
 	/* Verify that new alignment_offset is on a logical block boundary */
@@ -643,7 +643,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 						      b->max_discard_sectors);
 		t->discard_granularity = max(t->discard_granularity,
 					     b->discard_granularity);
-		t->discard_alignment = lcm(t->discard_alignment, alignment) %
+		t->discard_alignment = lcm_not_zero(t->discard_alignment, alignment) %
 			t->discard_granularity;
 	}
 
diff --git a/include/linux/lcm.h b/include/linux/lcm.h
index 7bf01d779b45..1ce79a7f1daa 100644
--- a/include/linux/lcm.h
+++ b/include/linux/lcm.h
@@ -4,5 +4,6 @@
 #include <linux/compiler.h>
 
 unsigned long lcm(unsigned long a, unsigned long b) __attribute_const__;
+unsigned long lcm_not_zero(unsigned long a, unsigned long b) __attribute_const__;
 
 #endif /* _LCM_H */
diff --git a/lib/lcm.c b/lib/lcm.c
index e97dbd51e756..03d7fcb420b5 100644
--- a/lib/lcm.c
+++ b/lib/lcm.c
@@ -12,3 +12,14 @@ unsigned long lcm(unsigned long a, unsigned long b)
 		return 0;
 }
 EXPORT_SYMBOL_GPL(lcm);
+
+unsigned long lcm_not_zero(unsigned long a, unsigned long b)
+{
+	unsigned long l = lcm(a, b);
+
+	if (l)
+		return l;
+
+	return (b ? : a);
+}
+EXPORT_SYMBOL_GPL(lcm_not_zero);
-- 
cgit v1.2.3-55-g7522


From f9c72d10d6fbf949558cd088389a42213ed7b12d Mon Sep 17 00:00:00 2001
From: Jeff Layton
Date: Tue, 31 Mar 2015 12:03:28 -0400
Subject: sunrpc: make debugfs file creation failure non-fatal

We currently have a problem that SELinux policy is being enforced when
creating debugfs files. If a debugfs file is created as a side effect of
doing some syscall, then that creation can fail if the SELinux policy
for that process prevents it.

This seems wrong. We don't do that for files under /proc, for instance,
so Bruce has proposed a patch to fix that.

While discussing that patch however, Greg K.H. stated:

    "No kernel code should care / fail if a debugfs function fails, so
     please fix up the sunrpc code first."

This patch converts all of the sunrpc debugfs setup code to be void
return functins, and the callers to not look for errors from those
functions.

This should allow rpc_clnt and rpc_xprt creation to work, even if the
kernel fails to create debugfs files for some reason.

Symptoms were failing krb5 mounts on systems using gss-proxy and
selinux.

Fixes: 388f0c776781 "sunrpc: add a debugfs rpc_xprt directory..."
Cc: stable@vger.kernel.org
Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/debug.h | 18 +++++++--------
 net/sunrpc/clnt.c            |  4 +---
 net/sunrpc/debugfs.c         | 52 ++++++++++++++++++++++++--------------------
 net/sunrpc/sunrpc_syms.c     |  7 +-----
 net/sunrpc/xprt.c            |  7 +-----
 5 files changed, 41 insertions(+), 47 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h
index c57d8ea0716c..59a7889e15db 100644
--- a/include/linux/sunrpc/debug.h
+++ b/include/linux/sunrpc/debug.h
@@ -60,17 +60,17 @@ struct rpc_xprt;
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 void		rpc_register_sysctl(void);
 void		rpc_unregister_sysctl(void);
-int		sunrpc_debugfs_init(void);
+void		sunrpc_debugfs_init(void);
 void		sunrpc_debugfs_exit(void);
-int		rpc_clnt_debugfs_register(struct rpc_clnt *);
+void		rpc_clnt_debugfs_register(struct rpc_clnt *);
 void		rpc_clnt_debugfs_unregister(struct rpc_clnt *);
-int		rpc_xprt_debugfs_register(struct rpc_xprt *);
+void		rpc_xprt_debugfs_register(struct rpc_xprt *);
 void		rpc_xprt_debugfs_unregister(struct rpc_xprt *);
 #else
-static inline int
+static inline void
 sunrpc_debugfs_init(void)
 {
-	return 0;
+	return;
 }
 
 static inline void
@@ -79,10 +79,10 @@ sunrpc_debugfs_exit(void)
 	return;
 }
 
-static inline int
+static inline void
 rpc_clnt_debugfs_register(struct rpc_clnt *clnt)
 {
-	return 0;
+	return;
 }
 
 static inline void
@@ -91,10 +91,10 @@ rpc_clnt_debugfs_unregister(struct rpc_clnt *clnt)
 	return;
 }
 
-static inline int
+static inline void
 rpc_xprt_debugfs_register(struct rpc_xprt *xprt)
 {
-	return 0;
+	return;
 }
 
 static inline void
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 612aa73bbc60..e6ce1517367f 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -303,9 +303,7 @@ static int rpc_client_register(struct rpc_clnt *clnt,
 	struct super_block *pipefs_sb;
 	int err;
 
-	err = rpc_clnt_debugfs_register(clnt);
-	if (err)
-		return err;
+	rpc_clnt_debugfs_register(clnt);
 
 	pipefs_sb = rpc_get_sb_net(net);
 	if (pipefs_sb) {
diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c
index e811f390f9f6..82962f7e6e88 100644
--- a/net/sunrpc/debugfs.c
+++ b/net/sunrpc/debugfs.c
@@ -129,48 +129,52 @@ static const struct file_operations tasks_fops = {
 	.release	= tasks_release,
 };
 
-int
+void
 rpc_clnt_debugfs_register(struct rpc_clnt *clnt)
 {
-	int len, err;
+	int len;
 	char name[24]; /* enough for "../../rpc_xprt/ + 8 hex digits + NULL */
+	struct rpc_xprt *xprt;
 
 	/* Already registered? */
-	if (clnt->cl_debugfs)
-		return 0;
+	if (clnt->cl_debugfs || !rpc_clnt_dir)
+		return;
 
 	len = snprintf(name, sizeof(name), "%x", clnt->cl_clid);
 	if (len >= sizeof(name))
-		return -EINVAL;
+		return;
 
 	/* make the per-client dir */
 	clnt->cl_debugfs = debugfs_create_dir(name, rpc_clnt_dir);
 	if (!clnt->cl_debugfs)
-		return -ENOMEM;
+		return;
 
 	/* make tasks file */
-	err = -ENOMEM;
 	if (!debugfs_create_file("tasks", S_IFREG | S_IRUSR, clnt->cl_debugfs,
 				 clnt, &tasks_fops))
 		goto out_err;
 
-	err = -EINVAL;
 	rcu_read_lock();
+	xprt = rcu_dereference(clnt->cl_xprt);
+	/* no "debugfs" dentry? Don't bother with the symlink. */
+	if (!xprt->debugfs) {
+		rcu_read_unlock();
+		return;
+	}
 	len = snprintf(name, sizeof(name), "../../rpc_xprt/%s",
-			rcu_dereference(clnt->cl_xprt)->debugfs->d_name.name);
+			xprt->debugfs->d_name.name);
 	rcu_read_unlock();
+
 	if (len >= sizeof(name))
 		goto out_err;
 
-	err = -ENOMEM;
 	if (!debugfs_create_symlink("xprt", clnt->cl_debugfs, name))
 		goto out_err;
 
-	return 0;
+	return;
 out_err:
 	debugfs_remove_recursive(clnt->cl_debugfs);
 	clnt->cl_debugfs = NULL;
-	return err;
 }
 
 void
@@ -226,33 +230,33 @@ static const struct file_operations xprt_info_fops = {
 	.release	= xprt_info_release,
 };
 
-int
+void
 rpc_xprt_debugfs_register(struct rpc_xprt *xprt)
 {
 	int len, id;
 	static atomic_t	cur_id;
 	char		name[9]; /* 8 hex digits + NULL term */
 
+	if (!rpc_xprt_dir)
+		return;
+
 	id = (unsigned int)atomic_inc_return(&cur_id);
 
 	len = snprintf(name, sizeof(name), "%x", id);
 	if (len >= sizeof(name))
-		return -EINVAL;
+		return;
 
 	/* make the per-client dir */
 	xprt->debugfs = debugfs_create_dir(name, rpc_xprt_dir);
 	if (!xprt->debugfs)
-		return -ENOMEM;
+		return;
 
 	/* make tasks file */
 	if (!debugfs_create_file("info", S_IFREG | S_IRUSR, xprt->debugfs,
 				 xprt, &xprt_info_fops)) {
 		debugfs_remove_recursive(xprt->debugfs);
 		xprt->debugfs = NULL;
-		return -ENOMEM;
 	}
-
-	return 0;
 }
 
 void
@@ -266,14 +270,17 @@ void __exit
 sunrpc_debugfs_exit(void)
 {
 	debugfs_remove_recursive(topdir);
+	topdir = NULL;
+	rpc_clnt_dir = NULL;
+	rpc_xprt_dir = NULL;
 }
 
-int __init
+void __init
 sunrpc_debugfs_init(void)
 {
 	topdir = debugfs_create_dir("sunrpc", NULL);
 	if (!topdir)
-		goto out;
+		return;
 
 	rpc_clnt_dir = debugfs_create_dir("rpc_clnt", topdir);
 	if (!rpc_clnt_dir)
@@ -283,10 +290,9 @@ sunrpc_debugfs_init(void)
 	if (!rpc_xprt_dir)
 		goto out_remove;
 
-	return 0;
+	return;
 out_remove:
 	debugfs_remove_recursive(topdir);
 	topdir = NULL;
-out:
-	return -ENOMEM;
+	rpc_clnt_dir = NULL;
 }
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index e37fbed87956..ee5d3d253102 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -98,10 +98,7 @@ init_sunrpc(void)
 	if (err)
 		goto out4;
 
-	err = sunrpc_debugfs_init();
-	if (err)
-		goto out5;
-
+	sunrpc_debugfs_init();
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 	rpc_register_sysctl();
 #endif
@@ -109,8 +106,6 @@ init_sunrpc(void)
 	init_socket_xprt();	/* clnt sock transport */
 	return 0;
 
-out5:
-	unregister_rpc_pipefs();
 out4:
 	unregister_pernet_subsys(&sunrpc_net_ops);
 out3:
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index e3015aede0d9..9949722d99ce 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1331,7 +1331,6 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net)
  */
 struct rpc_xprt *xprt_create_transport(struct xprt_create *args)
 {
-	int err;
 	struct rpc_xprt	*xprt;
 	struct xprt_class *t;
 
@@ -1372,11 +1371,7 @@ found:
 		return ERR_PTR(-ENOMEM);
 	}
 
-	err = rpc_xprt_debugfs_register(xprt);
-	if (err) {
-		xprt_destroy(xprt);
-		return ERR_PTR(err);
-	}
+	rpc_xprt_debugfs_register(xprt);
 
 	dprintk("RPC:       created transport %p with %u slots\n", xprt,
 			xprt->max_reqs);
-- 
cgit v1.2.3-55-g7522


From f60e5990d9c1424af9dbca60a23ba2a1c7c1ce90 Mon Sep 17 00:00:00 2001
From: hannes@stressinduktion.org
Date: Wed, 1 Apr 2015 17:07:44 +0200
Subject: ipv6: protect skb->sk accesses from recursive dereference inside the
 stack

We should not consult skb->sk for output decisions in xmit recursion
levels > 0 in the stack. Otherwise local socket settings could influence
the result of e.g. tunnel encapsulation process.

ipv6 does not conform with this in three places:

1) ip6_fragment: we do consult ipv6_npinfo for frag_size

2) sk_mc_loop in ipv6 uses skb->sk and checks if we should
   loop the packet back to the local socket

3) ip6_skb_dst_mtu could query the settings from the user socket and
   force a wrong MTU

Furthermore:
In sk_mc_loop we could potentially land in WARN_ON(1) if we use a
PF_PACKET socket ontop of an IPv6-backed vxlan device.

Reuse xmit_recursion as we are currently only interested in protecting
tunnel devices.

Cc: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  6 ++++++
 include/net/ip.h          | 16 ----------------
 include/net/ip6_route.h   |  3 ++-
 include/net/sock.h        |  2 ++
 net/core/dev.c            |  4 +++-
 net/core/sock.c           | 19 +++++++++++++++++++
 net/ipv6/ip6_output.c     |  3 ++-
 7 files changed, 34 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index dcf6ec27739b..278738873703 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2185,6 +2185,12 @@ void netdev_freemem(struct net_device *dev);
 void synchronize_net(void);
 int init_dummy_netdev(struct net_device *dev);
 
+DECLARE_PER_CPU(int, xmit_recursion);
+static inline int dev_recursion_level(void)
+{
+	return this_cpu_read(xmit_recursion);
+}
+
 struct net_device *dev_get_by_index(struct net *net, int ifindex);
 struct net_device *__dev_get_by_index(struct net *net, int ifindex);
 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
diff --git a/include/net/ip.h b/include/net/ip.h
index 025c61c0dffb..6cc1eafb153a 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -453,22 +453,6 @@ static __inline__ void inet_reset_saddr(struct sock *sk)
 
 #endif
 
-static inline int sk_mc_loop(struct sock *sk)
-{
-	if (!sk)
-		return 1;
-	switch (sk->sk_family) {
-	case AF_INET:
-		return inet_sk(sk)->mc_loop;
-#if IS_ENABLED(CONFIG_IPV6)
-	case AF_INET6:
-		return inet6_sk(sk)->mc_loop;
-#endif
-	}
-	WARN_ON(1);
-	return 1;
-}
-
 bool ip_call_ra_chain(struct sk_buff *skb);
 
 /*
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 1d09b46c1e48..eda131d179d9 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -174,7 +174,8 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
 
 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
 {
-	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
+	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
+				inet6_sk(skb->sk) : NULL;
 
 	return (np && np->pmtudisc >= IPV6_PMTUDISC_PROBE) ?
 	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
diff --git a/include/net/sock.h b/include/net/sock.h
index ab186b1d31ff..e4079c28e6b8 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1762,6 +1762,8 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie);
 
 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie);
 
+bool sk_mc_loop(struct sock *sk);
+
 static inline bool sk_can_gso(const struct sock *sk)
 {
 	return net_gso_ok(sk->sk_route_caps, sk->sk_gso_type);
diff --git a/net/core/dev.c b/net/core/dev.c
index 962ee9d71964..45109b70664e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2848,7 +2848,9 @@ static void skb_update_prio(struct sk_buff *skb)
 #define skb_update_prio(skb)
 #endif
 
-static DEFINE_PER_CPU(int, xmit_recursion);
+DEFINE_PER_CPU(int, xmit_recursion);
+EXPORT_SYMBOL(xmit_recursion);
+
 #define RECURSION_LIMIT 10
 
 /**
diff --git a/net/core/sock.c b/net/core/sock.c
index 78e89eb7eb70..71e3e5f1eaa0 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -653,6 +653,25 @@ static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 		sock_reset_flag(sk, bit);
 }
 
+bool sk_mc_loop(struct sock *sk)
+{
+	if (dev_recursion_level())
+		return false;
+	if (!sk)
+		return true;
+	switch (sk->sk_family) {
+	case AF_INET:
+		return inet_sk(sk)->mc_loop;
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		return inet6_sk(sk)->mc_loop;
+#endif
+	}
+	WARN_ON(1);
+	return true;
+}
+EXPORT_SYMBOL(sk_mc_loop);
+
 /*
  *	This is meant for all protocols to use and covers goings on
  *	at the socket level. Everything here is generic.
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 7e80b61b51ff..36cf0ab685a0 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -542,7 +542,8 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 {
 	struct sk_buff *frag;
 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
-	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
+	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
+				inet6_sk(skb->sk) : NULL;
 	struct ipv6hdr *tmp_hdr;
 	struct frag_hdr *fh;
 	unsigned int mtu, hlen, left, len;
-- 
cgit v1.2.3-55-g7522