diff options
author | Tariq Toukan | 2016-09-15 15:08:36 +0200 |
---|---|---|
committer | David S. Miller | 2016-09-17 15:51:40 +0200 |
commit | 7e426671704d2266757dff9c4254b788561aa11e (patch) | |
tree | 0fe0c74ea07692bc6d1c72e8483b1fdaf0fec7a8 /drivers/net/ethernet/mellanox/mlx5/core/en.h | |
parent | rxrpc: Make IPv6 support conditional on CONFIG_IPV6 (diff) | |
download | kernel-qcow2-linux-7e426671704d2266757dff9c4254b788561aa11e.tar.gz kernel-qcow2-linux-7e426671704d2266757dff9c4254b788561aa11e.tar.xz kernel-qcow2-linux-7e426671704d2266757dff9c4254b788561aa11e.zip |
net/mlx5e: Single flow order-0 pages for Striding RQ
To improve the memory consumption scheme, we drop the flow that
requires and splits high-order pages in Striding RQ, and keep
a single Striding RQ flow that uses order-0 pages.
Moving to fragmented memory allows the use of larger MPWQEs,
which reduces the number of UMR posts and filler CQEs.
Moving to a single flow allows several optimizations that improve
performance, especially on production servers, where we would
fall back to order-0 allocations anyway:
- inline functions that were called via function pointers.
- improve the UMR post process.
This patch alone is expected to cause a slight performance reduction.
However, the new memory scheme makes it possible to use a page cache
of a fair size that doesn't inflate the memory footprint, which will
recover the reduction and even give a performance gain.
Performance tests:
The following results were measured on a freshly booted system,
giving optimal baseline performance, as high-order pages are yet to
be fragmented and depleted.
We ran pktgen single-stream benchmarks, with iptables-raw-drop:
Single stride, 64 bytes:
* 4,739,057 - baseline
* 4,749,550 - this patch
no reduction
Larger packets, no page cross, 1024 bytes:
* 3,982,361 - baseline
* 3,845,682 - this patch
3.5% reduction
Larger packets, every 3rd packet crosses a page, 1500 bytes:
* 3,731,189 - baseline
* 3,579,414 - this patch
4% reduction
Fixes: 461017cb006a ("net/mlx5e: Support RX multi-packet WQE (Striding RQ)")
Fixes: bc77b240b3c5 ("net/mlx5e: Add fragmented memory support for RX multi packet WQE")
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers/net/ethernet/mellanox/mlx5/core/en.h')
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/en.h | 54 |
1 files changed, 15 insertions, 39 deletions
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index a9358cf7386a..401b2f7b165f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -62,12 +62,12 @@ #define MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE 0xd #define MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE_MPW 0x1 -#define MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE_MPW 0x4 +#define MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE_MPW 0x3 #define MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE_MPW 0x6 #define MLX5_MPWRQ_LOG_STRIDE_SIZE 6 /* >= 6, HW restriction */ #define MLX5_MPWRQ_LOG_STRIDE_SIZE_CQE_COMPRESS 8 /* >= 6, HW restriction */ -#define MLX5_MPWRQ_LOG_WQE_SZ 17 +#define MLX5_MPWRQ_LOG_WQE_SZ 18 #define MLX5_MPWRQ_WQE_PAGE_ORDER (MLX5_MPWRQ_LOG_WQE_SZ - PAGE_SHIFT > 0 ? \ MLX5_MPWRQ_LOG_WQE_SZ - PAGE_SHIFT : 0) #define MLX5_MPWRQ_PAGES_PER_WQE BIT(MLX5_MPWRQ_WQE_PAGE_ORDER) @@ -293,8 +293,8 @@ struct mlx5e_rq { u32 wqe_sz; struct sk_buff **skb; struct mlx5e_mpw_info *wqe_info; + void *mtt_no_align; __be32 mkey_be; - __be32 umr_mkey_be; struct device *pdev; struct net_device *netdev; @@ -323,32 +323,15 @@ struct mlx5e_rq { struct mlx5e_umr_dma_info { __be64 *mtt; - __be64 *mtt_no_align; dma_addr_t mtt_addr; - struct mlx5e_dma_info *dma_info; + struct mlx5e_dma_info dma_info[MLX5_MPWRQ_PAGES_PER_WQE]; + struct mlx5e_umr_wqe wqe; }; struct mlx5e_mpw_info { - union { - struct mlx5e_dma_info dma_info; - struct mlx5e_umr_dma_info umr; - }; + struct mlx5e_umr_dma_info umr; u16 consumed_strides; u16 skbs_frags[MLX5_MPWRQ_PAGES_PER_WQE]; - - void (*dma_pre_sync)(struct device *pdev, - struct mlx5e_mpw_info *wi, - u32 wqe_offset, u32 len); - void (*add_skb_frag)(struct mlx5e_rq *rq, - struct sk_buff *skb, - struct mlx5e_mpw_info *wi, - u32 page_idx, u32 frag_offset, u32 len); - void (*copy_skb_header)(struct device *pdev, - struct sk_buff *skb, - struct mlx5e_mpw_info *wi, - u32 page_idx, u32 offset, - u32 headlen); - void (*free_wqe)(struct mlx5e_rq *rq, 
struct mlx5e_mpw_info *wi); }; struct mlx5e_tx_wqe_info { @@ -672,24 +655,11 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe); void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe); bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq); int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix); -int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix); +int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix); void mlx5e_dealloc_rx_wqe(struct mlx5e_rq *rq, u16 ix); void mlx5e_dealloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix); -void mlx5e_post_rx_fragmented_mpwqe(struct mlx5e_rq *rq); -void mlx5e_complete_rx_linear_mpwqe(struct mlx5e_rq *rq, - struct mlx5_cqe64 *cqe, - u16 byte_cnt, - struct mlx5e_mpw_info *wi, - struct sk_buff *skb); -void mlx5e_complete_rx_fragmented_mpwqe(struct mlx5e_rq *rq, - struct mlx5_cqe64 *cqe, - u16 byte_cnt, - struct mlx5e_mpw_info *wi, - struct sk_buff *skb); -void mlx5e_free_rx_linear_mpwqe(struct mlx5e_rq *rq, - struct mlx5e_mpw_info *wi); -void mlx5e_free_rx_fragmented_mpwqe(struct mlx5e_rq *rq, - struct mlx5e_mpw_info *wi); +void mlx5e_post_rx_mpwqe(struct mlx5e_rq *rq); +void mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi); struct mlx5_cqe64 *mlx5e_get_cqe(struct mlx5e_cq *cq); void mlx5e_rx_am(struct mlx5e_rq *rq); @@ -776,6 +746,12 @@ static inline void mlx5e_cq_arm(struct mlx5e_cq *cq) mlx5_cq_arm(mcq, MLX5_CQ_DB_REQ_NOT, mcq->uar->map, NULL, cq->wq.cc); } +static inline u32 mlx5e_get_wqe_mtt_offset(struct mlx5e_rq *rq, u16 wqe_ix) +{ + return rq->mpwqe_mtt_offset + + wqe_ix * ALIGN(MLX5_MPWRQ_PAGES_PER_WQE, 8); +} + static inline int mlx5e_get_max_num_channels(struct mlx5_core_dev *mdev) { return min_t(int, mdev->priv.eq_table.num_comp_vectors, |