Diffstat (limited to 'net/core/dev.c')
 net/core/dev.c | 245 ++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 182 insertions(+), 63 deletions(-)
diff --git a/net/core/dev.c b/net/core/dev.c
index 1796cef55ab5..6778a9999d52 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -135,6 +135,7 @@
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
+#include <linux/netfilter_ingress.h>
#include "net-sysfs.h"
@@ -468,10 +469,14 @@ EXPORT_SYMBOL(dev_remove_pack);
*/
void dev_add_offload(struct packet_offload *po)
{
- struct list_head *head = &offload_base;
+ struct packet_offload *elem;
spin_lock(&offload_lock);
- list_add_rcu(&po->list, head);
+ list_for_each_entry(elem, &offload_base, list) {
+ if (po->priority < elem->priority)
+ break;
+ }
+ list_add_rcu(&po->list, elem->list.prev);
spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);
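
With this change dev_add_offload() keeps offload_base sorted by ascending
priority: the loop breaks at the first entry whose priority is higher than
the new one, and list_add_rcu() on elem->list.prev inserts just before that
entry (or at the tail if the loop runs to completion). A minimal caller-side
sketch of registering an offload with the new priority field; the offload and
its gso_segment handler are hypothetical:

	/* Hypothetical offload: lower .priority values sort earlier in
	 * offload_base and are therefore matched first.
	 */
	static struct packet_offload example_offload __read_mostly = {
		.type     = cpu_to_be16(ETH_P_MPLS_UC),
		.priority = 10,
		.callbacks = {
			.gso_segment = example_gso_segment,
		},
	};

	static int __init example_offload_init(void)
	{
		dev_add_offload(&example_offload);
		return 0;
	}
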
@@ -1630,7 +1635,7 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
}
EXPORT_SYMBOL(call_netdevice_notifiers);
-#ifdef CONFIG_NET_CLS_ACT
+#ifdef CONFIG_NET_INGRESS
static struct static_key ingress_needed __read_mostly;
void net_inc_ingress_queue(void)
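
The ingress_needed static key, now guarded by CONFIG_NET_INGRESS so both tc
and netfilter ingress users can share it, follows the usual jump-label
pattern: the fast-path branch is patched out until the first user calls the
inc helper. An illustrative sketch of that pattern, not the exact helper
bodies from this file:

	/* Sketch of static-key gating: the test compiles to a patched
	 * no-op while the key's count is zero.
	 */
	static struct static_key example_needed __read_mostly;

	static void example_enable(void)
	{
		static_key_slow_inc(&example_needed);	/* patch branch in */
	}

	static void example_disable(void)
	{
		static_key_slow_dec(&example_needed);	/* patch branch out */
	}

	static void example_fast_path(void)
	{
		if (static_key_false(&example_needed))
			;	/* rarely-enabled slow path */
	}
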
@@ -1718,15 +1723,8 @@ EXPORT_SYMBOL_GPL(is_skb_forwardable);
int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
- if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
- if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
- atomic_long_inc(&dev->rx_dropped);
- kfree_skb(skb);
- return NET_RX_DROP;
- }
- }
-
- if (unlikely(!is_skb_forwardable(dev, skb))) {
+ if (skb_orphan_frags(skb, GFP_ATOMIC) ||
+ unlikely(!is_skb_forwardable(dev, skb))) {
atomic_long_inc(&dev->rx_dropped);
kfree_skb(skb);
return NET_RX_DROP;
@@ -2350,6 +2348,34 @@ void netif_device_attach(struct net_device *dev)
}
EXPORT_SYMBOL(netif_device_attach);
+/*
+ * Returns a Tx hash based on the given packet descriptor and the number of
+ * Tx queues to be used as a distribution range.
+ */
+u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
+ unsigned int num_tx_queues)
+{
+ u32 hash;
+ u16 qoffset = 0;
+ u16 qcount = num_tx_queues;
+
+ if (skb_rx_queue_recorded(skb)) {
+ hash = skb_get_rx_queue(skb);
+ while (unlikely(hash >= num_tx_queues))
+ hash -= num_tx_queues;
+ return hash;
+ }
+
+ if (dev->num_tc) {
+ u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
+ qoffset = dev->tc_to_txq[tc].offset;
+ qcount = dev->tc_to_txq[tc].count;
+ }
+
+ return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
+}
+EXPORT_SYMBOL(__skb_tx_hash);
+
static void skb_warn_bad_offload(const struct sk_buff *skb)
{
static const netdev_features_t null_features = 0;
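
__skb_tx_hash() relies on reciprocal_scale() from linux/kernel.h, which maps
a 32-bit hash into [0, qcount) with a multiply-and-shift instead of a modulo.
A worked sketch of the arithmetic (the queue numbers are hypothetical):

	/* reciprocal_scale(val, ep_ro) == (u32)(((u64)val * ep_ro) >> 32),
	 * i.e. val scaled from [0, 2^32) down to [0, ep_ro).
	 *
	 * With qcount = 4 and qoffset = 8 (a traffic class owning
	 * queues 8..11):
	 *   hash = 0x80000000: (0x80000000ULL * 4) >> 32 = 2
	 *   selected queue   = 2 + 8 = 10
	 */
	static u16 example_pick_queue(u32 hash, u16 qcount, u16 qoffset)
	{
		return (u16)(((u64)hash * qcount) >> 32) + qoffset;
	}
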
@@ -2908,6 +2934,84 @@ int dev_loopback_xmit(struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(dev_loopback_xmit);
+static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
+{
+#ifdef CONFIG_XPS
+ struct xps_dev_maps *dev_maps;
+ struct xps_map *map;
+ int queue_index = -1;
+
+ rcu_read_lock();
+ dev_maps = rcu_dereference(dev->xps_maps);
+ if (dev_maps) {
+ map = rcu_dereference(
+ dev_maps->cpu_map[skb->sender_cpu - 1]);
+ if (map) {
+ if (map->len == 1)
+ queue_index = map->queues[0];
+ else
+ queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
+ map->len)];
+ if (unlikely(queue_index >= dev->real_num_tx_queues))
+ queue_index = -1;
+ }
+ }
+ rcu_read_unlock();
+
+ return queue_index;
+#else
+ return -1;
+#endif
+}
+
+static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+ int queue_index = sk_tx_queue_get(sk);
+
+ if (queue_index < 0 || skb->ooo_okay ||
+ queue_index >= dev->real_num_tx_queues) {
+ int new_index = get_xps_queue(dev, skb);
+ if (new_index < 0)
+ new_index = skb_tx_hash(dev, skb);
+
+ if (queue_index != new_index && sk &&
+ rcu_access_pointer(sk->sk_dst_cache))
+ sk_tx_queue_set(sk, new_index);
+
+ queue_index = new_index;
+ }
+
+ return queue_index;
+}
+
+struct netdev_queue *netdev_pick_tx(struct net_device *dev,
+ struct sk_buff *skb,
+ void *accel_priv)
+{
+ int queue_index = 0;
+
+#ifdef CONFIG_XPS
+ if (skb->sender_cpu == 0)
+ skb->sender_cpu = raw_smp_processor_id() + 1;
+#endif
+
+ if (dev->real_num_tx_queues != 1) {
+ const struct net_device_ops *ops = dev->netdev_ops;
+ if (ops->ndo_select_queue)
+ queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
+ __netdev_pick_tx);
+ else
+ queue_index = __netdev_pick_tx(dev, skb);
+
+ if (!accel_priv)
+ queue_index = netdev_cap_txqueue(dev, queue_index);
+ }
+
+ skb_set_queue_mapping(skb, queue_index);
+ return netdev_get_tx_queue(dev, queue_index);
+}
+
/**
* __dev_queue_xmit - transmit a buffer
* @skb: buffer to transmit
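
netdev_pick_tx() first lets a driver's ndo_select_queue() choose, passing
__netdev_pick_tx as the fallback, and only then clamps the result with
netdev_cap_txqueue(). A hedged sketch of how a multiqueue driver might plug
into this; the driver and its policy are hypothetical:

	/* Hypothetical driver policy: pin control traffic to queue 0,
	 * defer everything else to the stack's XPS/hash based choice.
	 */
	static u16 example_select_queue(struct net_device *dev,
					struct sk_buff *skb,
					void *accel_priv,
					select_queue_fallback_t fallback)
	{
		if (skb->priority == TC_PRIO_CONTROL)
			return 0;
		return fallback(dev, skb);
	}

	static const struct net_device_ops example_netdev_ops = {
		.ndo_select_queue = example_select_queue,
		/* remaining ops elided */
	};
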
@@ -3079,7 +3183,7 @@ static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_dev_flow *rflow, u16 next_cpu)
{
- if (next_cpu != RPS_NO_CPU) {
+ if (next_cpu < nr_cpu_ids) {
#ifdef CONFIG_RFS_ACCEL
struct netdev_rx_queue *rxqueue;
struct rps_dev_flow_table *flow_table;
@@ -3184,7 +3288,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
* If the desired CPU (where last recvmsg was done) is
* different from current CPU (one in the rx-queue flow
* table entry), switch if one of the following holds:
- * - Current CPU is unset (equal to RPS_NO_CPU).
+ * - Current CPU is unset (>= nr_cpu_ids).
* - Current CPU is offline.
* - The current CPU's queue tail has advanced beyond the
* last packet that was enqueued using this table entry.
@@ -3192,14 +3296,14 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
* have been dequeued, thus preserving in order delivery.
*/
if (unlikely(tcpu != next_cpu) &&
- (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
+ (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
((int)(per_cpu(softnet_data, tcpu).input_queue_head -
rflow->last_qtail)) >= 0)) {
tcpu = next_cpu;
rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
}
- if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
+ if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
*rflowp = rflow;
cpu = tcpu;
goto done;
@@ -3240,14 +3344,14 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
struct rps_dev_flow_table *flow_table;
struct rps_dev_flow *rflow;
bool expire = true;
- int cpu;
+ unsigned int cpu;
rcu_read_lock();
flow_table = rcu_dereference(rxqueue->rps_flow_table);
if (flow_table && flow_id <= flow_table->mask) {
rflow = &flow_table->flows[flow_id];
cpu = ACCESS_ONCE(rflow->cpu);
- if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
+ if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
((int)(per_cpu(softnet_data, cpu).input_queue_head -
rflow->last_qtail) <
(int)(10 * flow_table->mask)))
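
Both the steering check above and this expiry test compare the per-CPU
input_queue_head counter against rflow->last_qtail through a cast to int, so
the result stays correct when the unsigned counters wrap. A small sketch of
why the signed difference works, with values chosen to show wraparound:

	/* Wraparound-safe "has the queue head advanced past last_qtail?"
	 * test. With head = 0x00000005 and last_qtail = 0xfffffffb (head
	 * has wrapped), head - last_qtail = 0x0000000a and (int)0xa >= 0,
	 * so the flow is correctly treated as fully drained.
	 */
	static bool example_flow_drained(unsigned int head,
					 unsigned int last_qtail)
	{
		return (int)(head - last_qtail) >= 0;
	}
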
@@ -3520,66 +3624,47 @@ int (*br_fdb_test_addr_hook)(struct net_device *dev,
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
-#ifdef CONFIG_NET_CLS_ACT
-/* TODO: Maybe we should just force sch_ingress to be compiled in
- * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
- * a compare and 2 stores extra right now if we dont have it on
- * but have CONFIG_NET_CLS_ACT
- * NOTE: This doesn't stop any functionality; if you dont have
- * the ingress scheduler, you just can't add policies on ingress.
- *
- */
-static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
-{
- struct net_device *dev = skb->dev;
- u32 ttl = G_TC_RTTL(skb->tc_verd);
- int result = TC_ACT_OK;
- struct Qdisc *q;
-
- if (unlikely(MAX_RED_LOOP < ttl++)) {
- net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
- skb->skb_iif, dev->ifindex);
- return TC_ACT_SHOT;
- }
-
- skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
- skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
-
- q = rcu_dereference(rxq->qdisc);
- if (q != &noop_qdisc) {
- spin_lock(qdisc_lock(q));
- if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
- result = qdisc_enqueue_root(skb, q);
- spin_unlock(qdisc_lock(q));
- }
-
- return result;
-}
-
static inline struct sk_buff *handle_ing(struct sk_buff *skb,
struct packet_type **pt_prev,
int *ret, struct net_device *orig_dev)
{
- struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
+#ifdef CONFIG_NET_CLS_ACT
+ struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
+ struct tcf_result cl_res;
- if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
+ /* If there's at least one ingress present somewhere (so
+ * we get here via the enabled static key), remaining devices
+ * that are not configured with an ingress qdisc will bail
+ * out here.
+ */
+ if (!cl)
return skb;
-
if (*pt_prev) {
*ret = deliver_skb(skb, *pt_prev, orig_dev);
*pt_prev = NULL;
}
- switch (ing_filter(skb, rxq)) {
+ qdisc_skb_cb(skb)->pkt_len = skb->len;
+ skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
+ qdisc_bstats_update_cpu(cl->q, skb);
+
+ switch (tc_classify(skb, cl, &cl_res)) {
+ case TC_ACT_OK:
+ case TC_ACT_RECLASSIFY:
+ skb->tc_index = TC_H_MIN(cl_res.classid);
+ break;
case TC_ACT_SHOT:
+ qdisc_qstats_drop_cpu(cl->q);
case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
kfree_skb(skb);
return NULL;
+ default:
+ break;
}
-
+#endif /* CONFIG_NET_CLS_ACT */
return skb;
}
-#endif
/**
* netdev_rx_handler_register - register receive handler
@@ -3652,6 +3737,22 @@ static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
}
}
+static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
+ int *ret, struct net_device *orig_dev)
+{
+#ifdef CONFIG_NETFILTER_INGRESS
+ if (nf_hook_ingress_active(skb)) {
+ if (*pt_prev) {
+ *ret = deliver_skb(skb, *pt_prev, orig_dev);
+ *pt_prev = NULL;
+ }
+
+ return nf_hook_ingress(skb);
+ }
+#endif /* CONFIG_NETFILTER_INGRESS */
+ return 0;
+}
+
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
struct packet_type *ptype, *pt_prev;
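
nf_ingress() dispatches to hooks registered at the new NF_NETDEV_INGRESS
attachment point (hence the added linux/netfilter_ingress.h include). A
minimal registration sketch, assuming this kernel generation's nf_hookfn
signature and nf_register_hook(); the hook itself is hypothetical:

	/* Hypothetical ingress tap: accept everything, i.e. observe
	 * packets without steering or dropping them.
	 */
	static unsigned int example_ingress_hook(const struct nf_hook_ops *ops,
						 struct sk_buff *skb,
						 const struct nf_hook_state *state)
	{
		return NF_ACCEPT;	/* continue up the stack */
	}

	static struct nf_hook_ops example_ops = {
		.hook     = example_ingress_hook,
		.pf       = NFPROTO_NETDEV,
		.hooknum  = NF_NETDEV_INGRESS,
		.priority = 0,
		/* .dev must be set to the net_device being tapped
		 * before calling nf_register_hook(&example_ops).
		 */
	};
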
@@ -3711,13 +3812,17 @@ another_round:
}
skip_taps:
-#ifdef CONFIG_NET_CLS_ACT
+#ifdef CONFIG_NET_INGRESS
if (static_key_false(&ingress_needed)) {
skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
if (!skb)
goto unlock;
- }
+ if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
+ goto unlock;
+ }
+#endif
+#ifdef CONFIG_NET_CLS_ACT
skb->tc_verd = 0;
ncls:
#endif
@@ -5209,7 +5314,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
return -EBUSY;
- if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
+ if (__netdev_find_adj(dev, upper_dev, &dev->adj_list.upper))
return -EEXIST;
if (master && netdev_master_upper_dev_get(dev))
@@ -6320,6 +6425,17 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
return 0;
}
+void netif_tx_stop_all_queues(struct net_device *dev)
+{
+ unsigned int i;
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
+ netif_tx_stop_queue(txq);
+ }
+}
+EXPORT_SYMBOL(netif_tx_stop_all_queues);
+
/**
* register_netdevice - register a network device
* @dev: device to register
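
The newly exported netif_tx_stop_all_queues() walks every TX queue and marks
it stopped so the core stops handing the driver skbs. A hypothetical close
path showing typical usage:

	/* Hypothetical ndo_stop: quiesce TX, drop carrier, then release
	 * hardware resources.
	 */
	static int example_ndo_stop(struct net_device *dev)
	{
		netif_tx_stop_all_queues(dev);
		netif_carrier_off(dev);
		/* free rings, disable interrupts, etc. */
		return 0;
	}
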
@@ -6869,6 +6985,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
dev->group = INIT_NETDEV_GROUP;
if (!dev->ethtool_ops)
dev->ethtool_ops = &default_ethtool_ops;
+
+ nf_hook_ingress_init(dev);
+
return dev;
free_all: