bnx2x: uses build_skb() in receive path

bnx2x uses following formula to compute its rx_buf_sz : dev->mtu + 2*L1_CACHE_BYTES + 14 + 8 + 8 + 2 Then core network adds NET_SKB_PAD and SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) Final allocated size for skb head on x86_64 (L1_CACHE_BYTES = 64, MTU=1500) : 2112 bytes : SLUB/SLAB round this to 4096 bytes. Since skb truesize is then bigger than SK_MEM_QUANTUM, we have lot of false sharing because of mem_reclaim in UDP stack. One possible way to half truesize is to reduce the need by 64 bytes (2112 -> 2048 bytes) Instead of allocating a full cache line at the end of packet for alignment, we can use the fact that skb_shared_info sits at the end of skb->head, and we can use this room, if we convert bnx2x to new build_skb() infrastructure. skb_shared_info will be initialized after hardware finished its transfert, so we can eventually overwrite the final padding. Using build_skb() also reduces cache line misses in the driver, since we use cache hot skb instead of cold ones. Number of in-flight sk_buff structures is lower, they are recycled while still hot. Performance results : (820.000 pps on a rx UDP monothread benchmark, instead of 720.000 pps) Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> CC: Eilon Greenstein <eilong@broadcom.com> CC: Ben Hutchings <bhutchings@solarflare.com> CC: Tom Herbert <therbert@google.com> CC: Jamal Hadi Salim <hadi@mojatatu.com> CC: Stephen Hemminger <shemminger@vyatta.com> CC: Thomas Graf <tgraf@infradead.org> CC: Herbert Xu <herbert@gondor.apana.org.au> CC: Jeff Kirsher <jeffrey.t.kirsher@intel.com> Acked-by: Eilon Greenstein <eilong@broadcom.com> Signed-off-by: David S. Miller <davem@davemloft.net>
author: Eric Dumazet 2011-11-14 07:05:34 +0100
committer: David S. Miller 2011-11-14 20:13:30 +0100
commit: e52fcb2462ac484e6dd6e68869536609f0216938 (patch)
tree: 794407ff242481d5d09e3e669cabfe129d80f6df /drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
parent: net: introduce build_skb() (diff)
download: kernel-qcow2-linux-e52fcb2462ac484e6dd6e68869536609f0216938.tar.gz
kernel-qcow2-linux-e52fcb2462ac484e6dd6e68869536609f0216938.tar.xz
kernel-qcow2-linux-e52fcb2462ac484e6dd6e68869536609f0216938.zip
1 files changed, 126 insertions, 141 deletions
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 13dad9230dbc..0d60b9e633ad 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -294,8 +294,21 @@ static void bnx2x_update_sge_prod(struct bnx2x_fastpath *fp,
 	   fp->last_max_sge, fp->rx_sge_prod);
 }
 
+/* Set Toeplitz hash value in the skb using the value from the
+ * CQE (calculated by HW).
+ */
+static u32 bnx2x_get_rxhash(const struct bnx2x *bp,
+			    const struct eth_fast_path_rx_cqe *cqe)
+{
+	/* Set Toeplitz hash from CQE */
+	if ((bp->dev->features & NETIF_F_RXHASH) &&
+	    (cqe->status_flags & ETH_FAST_PATH_RX_CQE_RSS_HASH_FLG))
+		return le32_to_cpu(cqe->rss_hash_result);
+	return 0;
+}
+
 static void bnx2x_tpa_start(struct bnx2x_fastpath *fp, u16 queue,
-			    struct sk_buff *skb, u16 cons, u16 prod,
+			    u16 cons, u16 prod,
 			    struct eth_fast_path_rx_cqe *cqe)
 {
 	struct bnx2x *bp = fp->bp;
@@ -310,9 +323,9 @@ static void bnx2x_tpa_start(struct bnx2x_fastpath *fp, u16 queue,
 	if (tpa_info->tpa_state != BNX2X_TPA_STOP)
 		BNX2X_ERR("start of bin not in stop [%d]\n", queue);
 
-	/* Try to map an empty skb from the aggregation info  */
+	/* Try to map an empty data buffer from the aggregation info  */
 	mapping = dma_map_single(&bp->pdev->dev,
-				 first_buf->skb->data,
+				 first_buf->data + NET_SKB_PAD,
 				 fp->rx_buf_size, DMA_FROM_DEVICE);
 	/*
 	 *  ...if it fails - move the skb from the consumer to the producer
@@ -322,15 +335,15 @@ static void bnx2x_tpa_start(struct bnx2x_fastpath *fp, u16 queue,
 
 	if (unlikely(dma_mapping_error(&bp->pdev->dev, mapping))) {
 		/* Move the BD from the consumer to the producer */
-		bnx2x_reuse_rx_skb(fp, cons, prod);
+		bnx2x_reuse_rx_data(fp, cons, prod);
 		tpa_info->tpa_state = BNX2X_TPA_ERROR;
 		return;
 	}
 
-	/* move empty skb from pool to prod */
-	prod_rx_buf->skb = first_buf->skb;
+	/* move empty data from pool to prod */
+	prod_rx_buf->data = first_buf->data;
 	dma_unmap_addr_set(prod_rx_buf, mapping, mapping);
-	/* point prod_bd to new skb */
+	/* point prod_bd to new data */
 	prod_bd->addr_hi = cpu_to_le32(U64_HI(mapping));
 	prod_bd->addr_lo = cpu_to_le32(U64_LO(mapping));
 
@@ -344,6 +357,7 @@ static void bnx2x_tpa_start(struct bnx2x_fastpath *fp, u16 queue,
 	tpa_info->tpa_state = BNX2X_TPA_START;
 	tpa_info->len_on_bd = le16_to_cpu(cqe->len_on_bd);
 	tpa_info->placement_offset = cqe->placement_offset;
+	tpa_info->rxhash = bnx2x_get_rxhash(bp, cqe);
 
 #ifdef BNX2X_STOP_ON_ERROR
 	fp->tpa_queue_used |= (1 << queue);
@@ -471,11 +485,12 @@ static void bnx2x_tpa_stop(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 {
 	struct bnx2x_agg_info *tpa_info = &fp->tpa_info[queue];
 	struct sw_rx_bd *rx_buf = &tpa_info->first_buf;
-	u8 pad = tpa_info->placement_offset;
+	u32 pad = tpa_info->placement_offset;
 	u16 len = tpa_info->len_on_bd;
-	struct sk_buff *skb = rx_buf->skb;
+	struct sk_buff *skb = NULL;
+	u8 *data = rx_buf->data;
 	/* alloc new skb */
-	struct sk_buff *new_skb;
+	u8 *new_data;
 	u8 old_tpa_state = tpa_info->tpa_state;
 
 	tpa_info->tpa_state = BNX2X_TPA_STOP;
@@ -486,18 +501,18 @@ static void bnx2x_tpa_stop(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 	if (old_tpa_state == BNX2X_TPA_ERROR)
 		goto drop;
 
-	/* Try to allocate the new skb */
-	new_skb = netdev_alloc_skb(bp->dev, fp->rx_buf_size);
+	/* Try to allocate the new data */
+	new_data = kmalloc(fp->rx_buf_size + NET_SKB_PAD, GFP_ATOMIC);
 
 	/* Unmap skb in the pool anyway, as we are going to change
 	   pool entry status to BNX2X_TPA_STOP even if new skb allocation
 	   fails. */
 	dma_unmap_single(&bp->pdev->dev, dma_unmap_addr(rx_buf, mapping),
 			 fp->rx_buf_size, DMA_FROM_DEVICE);
+	if (likely(new_data))
+		skb = build_skb(data);
 
-	if (likely(new_skb)) {
-		prefetch(skb);
-		prefetch(((char *)(skb)) + L1_CACHE_BYTES);
+	if (likely(skb)) {
 
 #ifdef BNX2X_STOP_ON_ERROR
 		if (pad + len > fp->rx_buf_size) {
@@ -509,8 +524,9 @@ static void bnx2x_tpa_stop(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 		}
 #endif
 
-		skb_reserve(skb, pad);
+		skb_reserve(skb, pad + NET_SKB_PAD);
 		skb_put(skb, len);
+		skb->rxhash = tpa_info->rxhash;
 
 		skb->protocol = eth_type_trans(skb, bp->dev);
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
@@ -526,8 +542,8 @@ static void bnx2x_tpa_stop(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 		}
 
 
-		/* put new skb in bin */
-		rx_buf->skb = new_skb;
+		/* put new data in bin */
+		rx_buf->data = new_data;
 
 		return;
 	}
@@ -539,19 +555,6 @@ drop:
 	fp->eth_q_stats.rx_skb_alloc_failed++;
 }
 
-/* Set Toeplitz hash value in the skb using the value from the
- * CQE (calculated by HW).
- */
-static inline void bnx2x_set_skb_rxhash(struct bnx2x *bp, union eth_rx_cqe *cqe,
-					struct sk_buff *skb)
-{
-	/* Set Toeplitz hash from CQE */
-	if ((bp->dev->features & NETIF_F_RXHASH) &&
-	    (cqe->fast_path_cqe.status_flags &
-	     ETH_FAST_PATH_RX_CQE_RSS_HASH_FLG))
-		skb->rxhash =
-		le32_to_cpu(cqe->fast_path_cqe.rss_hash_result);
-}
 
 int bnx2x_rx_int(struct bnx2x_fastpath *fp, int budget)
 {
@@ -594,6 +597,7 @@ int bnx2x_rx_int(struct bnx2x_fastpath *fp, int budget)
 		u8 cqe_fp_flags;
 		enum eth_rx_cqe_type cqe_fp_type;
 		u16 len, pad;
+		u8 *data;
 
 #ifdef BNX2X_STOP_ON_ERROR
 		if (unlikely(bp->panic))
@@ -604,13 +608,6 @@ int bnx2x_rx_int(struct bnx2x_fastpath *fp, int budget)
 		bd_prod = RX_BD(bd_prod);
 		bd_cons = RX_BD(bd_cons);
 
-		/* Prefetch the page containing the BD descriptor
-		   at producer's index. It will be needed when new skb is
-		   allocated */
-		prefetch((void *)(PAGE_ALIGN((unsigned long)
-					     (&fp->rx_desc_ring[bd_prod])) -
-				  PAGE_SIZE + 1));
-
 		cqe = &fp->rx_comp_ring[comp_ring_cons];
 		cqe_fp = &cqe->fast_path_cqe;
 		cqe_fp_flags = cqe_fp->type_error_flags;
@@ -626,125 +623,110 @@ int bnx2x_rx_int(struct bnx2x_fastpath *fp, int budget)
 		if (unlikely(CQE_TYPE_SLOW(cqe_fp_type))) {
 			bnx2x_sp_event(fp, cqe);
 			goto next_cqe;
+		}
+		rx_buf = &fp->rx_buf_ring[bd_cons];
+		data = rx_buf->data;
 
-		/* this is an rx packet */
-		} else {
-			rx_buf = &fp->rx_buf_ring[bd_cons];
-			skb = rx_buf->skb;
-			prefetch(skb);
-
-			if (!CQE_TYPE_FAST(cqe_fp_type)) {
+		if (!CQE_TYPE_FAST(cqe_fp_type)) {
 #ifdef BNX2X_STOP_ON_ERROR
-				/* sanity check */
-				if (fp->disable_tpa &&
-				    (CQE_TYPE_START(cqe_fp_type) ||
-				     CQE_TYPE_STOP(cqe_fp_type)))
-					BNX2X_ERR("START/STOP packet while "
-						  "disable_tpa type %x\n",
-						  CQE_TYPE(cqe_fp_type));
+			/* sanity check */
+			if (fp->disable_tpa &&
+			    (CQE_TYPE_START(cqe_fp_type) ||
+			     CQE_TYPE_STOP(cqe_fp_type)))
+				BNX2X_ERR("START/STOP packet while "
+					  "disable_tpa type %x\n",
+					  CQE_TYPE(cqe_fp_type));
 #endif
 
-				if (CQE_TYPE_START(cqe_fp_type)) {
-					u16 queue = cqe_fp->queue_index;
-					DP(NETIF_MSG_RX_STATUS,
-					   "calling tpa_start on queue %d\n",
-					   queue);
-
-					bnx2x_tpa_start(fp, queue, skb,
-							bd_cons, bd_prod,
-							cqe_fp);
-
-					/* Set Toeplitz hash for LRO skb */
-					bnx2x_set_skb_rxhash(bp, cqe, skb);
-
-					goto next_rx;
-
-				} else {
-					u16 queue =
-						cqe->end_agg_cqe.queue_index;
-					DP(NETIF_MSG_RX_STATUS,
-					   "calling tpa_stop on queue %d\n",
-					   queue);
+			if (CQE_TYPE_START(cqe_fp_type)) {
+				u16 queue = cqe_fp->queue_index;
+				DP(NETIF_MSG_RX_STATUS,
+				   "calling tpa_start on queue %d\n",
+				   queue);
 
-					bnx2x_tpa_stop(bp, fp, queue,
-						       &cqe->end_agg_cqe,
-						       comp_ring_cons);
+				bnx2x_tpa_start(fp, queue,
+						bd_cons, bd_prod,
+						cqe_fp);
+				goto next_rx;
+			} else {
+				u16 queue =
+					cqe->end_agg_cqe.queue_index;
+				DP(NETIF_MSG_RX_STATUS,
+				   "calling tpa_stop on queue %d\n",
+				   queue);
+
+				bnx2x_tpa_stop(bp, fp, queue,
+					       &cqe->end_agg_cqe,
+					       comp_ring_cons);
 #ifdef BNX2X_STOP_ON_ERROR
-					if (bp->panic)
-						return 0;
+				if (bp->panic)
+					return 0;
 #endif
 
-					bnx2x_update_sge_prod(fp, cqe_fp);
-					goto next_cqe;
-				}
+				bnx2x_update_sge_prod(fp, cqe_fp);
+				goto next_cqe;
 			}
-			/* non TPA */
-			len = le16_to_cpu(cqe_fp->pkt_len);
-			pad = cqe_fp->placement_offset;
-			dma_sync_single_for_cpu(&bp->pdev->dev,
+		}
+		/* non TPA */
+		len = le16_to_cpu(cqe_fp->pkt_len);
+		pad = cqe_fp->placement_offset;
+		dma_sync_single_for_cpu(&bp->pdev->dev,
 					dma_unmap_addr(rx_buf, mapping),
-						       pad + RX_COPY_THRESH,
-						       DMA_FROM_DEVICE);
-			prefetch(((char *)(skb)) + L1_CACHE_BYTES);
+					pad + RX_COPY_THRESH,
+					DMA_FROM_DEVICE);
+		pad += NET_SKB_PAD;
+		prefetch(data + pad); /* speedup eth_type_trans() */
+		/* is this an error packet? */
+		if (unlikely(cqe_fp_flags & ETH_RX_ERROR_FALGS)) {
+			DP(NETIF_MSG_RX_ERR,
+			   "ERROR  flags %x  rx packet %u\n",
+			   cqe_fp_flags, sw_comp_cons);
+			fp->eth_q_stats.rx_err_discard_pkt++;
+			goto reuse_rx;
+		}
 
-			/* is this an error packet? */
-			if (unlikely(cqe_fp_flags & ETH_RX_ERROR_FALGS)) {
+		/* Since we don't have a jumbo ring
+		 * copy small packets if mtu > 1500
+		 */
+		if ((bp->dev->mtu > ETH_MAX_PACKET_SIZE) &&
+		    (len <= RX_COPY_THRESH)) {
+			skb = netdev_alloc_skb_ip_align(bp->dev, len);
+			if (skb == NULL) {
 				DP(NETIF_MSG_RX_ERR,
-				   "ERROR  flags %x  rx packet %u\n",
-				   cqe_fp_flags, sw_comp_cons);
-				fp->eth_q_stats.rx_err_discard_pkt++;
+				   "ERROR  packet dropped because of alloc failure\n");
+				fp->eth_q_stats.rx_skb_alloc_failed++;
 				goto reuse_rx;
 			}
-
-			/* Since we don't have a jumbo ring
-			 * copy small packets if mtu > 1500
-			 */
-			if ((bp->dev->mtu > ETH_MAX_PACKET_SIZE) &&
-			    (len <= RX_COPY_THRESH)) {
-				struct sk_buff *new_skb;
-
-				new_skb = netdev_alloc_skb(bp->dev, len + pad);
-				if (new_skb == NULL) {
-					DP(NETIF_MSG_RX_ERR,
-					   "ERROR  packet dropped "
-					   "because of alloc failure\n");
-					fp->eth_q_stats.rx_skb_alloc_failed++;
-					goto reuse_rx;
-				}
-
-				/* aligned copy */
-				skb_copy_from_linear_data_offset(skb, pad,
-						    new_skb->data + pad, len);
-				skb_reserve(new_skb, pad);
-				skb_put(new_skb, len);
-
-				bnx2x_reuse_rx_skb(fp, bd_cons, bd_prod);
-
-				skb = new_skb;
-
-			} else
-			if (likely(bnx2x_alloc_rx_skb(bp, fp, bd_prod) == 0)) {
+			memcpy(skb->data, data + pad, len);
+			bnx2x_reuse_rx_data(fp, bd_cons, bd_prod);
+		} else {
+			if (likely(bnx2x_alloc_rx_data(bp, fp, bd_prod) == 0)) {
 				dma_unmap_single(&bp->pdev->dev,
-					dma_unmap_addr(rx_buf, mapping),
+						 dma_unmap_addr(rx_buf, mapping),
 						 fp->rx_buf_size,
 						 DMA_FROM_DEVICE);
+				skb = build_skb(data);
+				if (unlikely(!skb)) {
+					kfree(data);
+					fp->eth_q_stats.rx_skb_alloc_failed++;
+					goto next_rx;
+				}
 				skb_reserve(skb, pad);
-				skb_put(skb, len);
-
 			} else {
 				DP(NETIF_MSG_RX_ERR,
 				   "ERROR  packet dropped because "
 				   "of alloc failure\n");
 				fp->eth_q_stats.rx_skb_alloc_failed++;
 reuse_rx:
-				bnx2x_reuse_rx_skb(fp, bd_cons, bd_prod);
+				bnx2x_reuse_rx_data(fp, bd_cons, bd_prod);
 				goto next_rx;
 			}
 
+			skb_put(skb, len);
 			skb->protocol = eth_type_trans(skb, bp->dev);
 
 			/* Set Toeplitz hash for a none-LRO skb */
-			bnx2x_set_skb_rxhash(bp, cqe, skb);
+			skb->rxhash = bnx2x_get_rxhash(bp, cqe_fp);
 
 			skb_checksum_none_assert(skb);
 
@@ -767,7 +749,7 @@ reuse_rx:
 
 
 next_rx:
-		rx_buf->skb = NULL;
+		rx_buf->data = NULL;
 
 		bd_cons = NEXT_RX_IDX(bd_cons);
 		bd_prod = NEXT_RX_IDX(bd_prod);
@@ -1013,9 +995,9 @@ void bnx2x_init_rx_rings(struct bnx2x *bp)
 				struct sw_rx_bd *first_buf =
 					&tpa_info->first_buf;
 
-				first_buf->skb = netdev_alloc_skb(bp->dev,
-						       fp->rx_buf_size);
-				if (!first_buf->skb) {
+				first_buf->data = kmalloc(fp->rx_buf_size + NET_SKB_PAD,
+							  GFP_ATOMIC);
+				if (!first_buf->data) {
 					BNX2X_ERR("Failed to allocate TPA "
 						  "skb pool for queue[%d] - "
 						  "disabling TPA on this "
@@ -1118,16 +1100,16 @@ static void bnx2x_free_rx_bds(struct bnx2x_fastpath *fp)
 
 	for (i = 0; i < NUM_RX_BD; i++) {
 		struct sw_rx_bd *rx_buf = &fp->rx_buf_ring[i];
-		struct sk_buff *skb = rx_buf->skb;
+		u8 *data = rx_buf->data;
 
-		if (skb == NULL)
+		if (data == NULL)
 			continue;
 		dma_unmap_single(&bp->pdev->dev,
 				 dma_unmap_addr(rx_buf, mapping),
 				 fp->rx_buf_size, DMA_FROM_DEVICE);
 
-		rx_buf->skb = NULL;
-		dev_kfree_skb(skb);
+		rx_buf->data = NULL;
+		kfree(data);
 	}
 }
 
@@ -1509,6 +1491,7 @@ static inline void bnx2x_set_rx_buf_size(struct bnx2x *bp)
 
 	for_each_queue(bp, i) {
 		struct bnx2x_fastpath *fp = &bp->fp[i];
+		u32 mtu;
 
 		/* Always use a mini-jumbo MTU for the FCoE L2 ring */
 		if (IS_FCOE_IDX(i))
@@ -1518,13 +1501,15 @@ static inline void bnx2x_set_rx_buf_size(struct bnx2x *bp)
 			 * IP_HEADER_ALIGNMENT_PADDING to prevent a buffer
 			 * overrun attack.
 			 */
-			fp->rx_buf_size =
-				BNX2X_FCOE_MINI_JUMBO_MTU + ETH_OVREHEAD +
-				BNX2X_FW_RX_ALIGN + IP_HEADER_ALIGNMENT_PADDING;
+			mtu = BNX2X_FCOE_MINI_JUMBO_MTU;
 		else
-			fp->rx_buf_size =
-				bp->dev->mtu + ETH_OVREHEAD +
-				BNX2X_FW_RX_ALIGN + IP_HEADER_ALIGNMENT_PADDING;
+			mtu = bp->dev->mtu;
+		fp->rx_buf_size = BNX2X_FW_RX_ALIGN_START +
+				  IP_HEADER_ALIGNMENT_PADDING +
+				  ETH_OVREHEAD +
+				  mtu +
+				  BNX2X_FW_RX_ALIGN_END;
+		/* Note : rx_buf_size doesnt take into account NET_SKB_PAD */
 	}
 }
author	Eric Dumazet	2011-11-14 07:05:34 +0100
committer	David S. Miller	2011-11-14 20:13:30 +0100
commit	e52fcb2462ac484e6dd6e68869536609f0216938 (patch)
tree	794407ff242481d5d09e3e669cabfe129d80f6df /drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
parent	net: introduce build_skb() (diff)
download	kernel-qcow2-linux-e52fcb2462ac484e6dd6e68869536609f0216938.tar.gz kernel-qcow2-linux-e52fcb2462ac484e6dd6e68869536609f0216938.tar.xz kernel-qcow2-linux-e52fcb2462ac484e6dd6e68869536609f0216938.zip