diff options
Diffstat (limited to 'drivers/infiniband/hw/mlx5')
-rw-r--r-- | drivers/infiniband/hw/mlx5/cmd.c | 9 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/cq.c | 69 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/devx.c | 1067 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/flow.c | 13 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/ib_rep.c | 39 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/ib_rep.h | 4 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/mad.c | 60 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/main.c | 238 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/mem.c | 20 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/mlx5_ib.h | 50 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/mr.c | 564 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/odp.c | 56 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/qp.c | 314 |
13 files changed, 2103 insertions, 400 deletions
diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c index e3ec79b8f7f5..6c8645033102 100644 --- a/drivers/infiniband/hw/mlx5/cmd.c +++ b/drivers/infiniband/hw/mlx5/cmd.c @@ -190,12 +190,12 @@ int mlx5_cmd_alloc_sw_icm(struct mlx5_dm *dm, int type, u64 length, u16 uid, phys_addr_t *addr, u32 *obj_id) { struct mlx5_core_dev *dev = dm->dev; - u32 num_blocks = DIV_ROUND_UP(length, MLX5_SW_ICM_BLOCK_SIZE(dev)); u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {}; u32 in[MLX5_ST_SZ_DW(create_sw_icm_in)] = {}; unsigned long *block_map; u64 icm_start_addr; u32 log_icm_size; + u32 num_blocks; u32 max_blocks; u64 block_idx; void *sw_icm; @@ -224,6 +224,8 @@ int mlx5_cmd_alloc_sw_icm(struct mlx5_dm *dm, int type, u64 length, return -EINVAL; } + num_blocks = (length + MLX5_SW_ICM_BLOCK_SIZE(dev) - 1) >> + MLX5_LOG_SW_ICM_BLOCK_SIZE(dev); max_blocks = BIT(log_icm_size - MLX5_LOG_SW_ICM_BLOCK_SIZE(dev)); spin_lock(&dm->lock); block_idx = bitmap_find_next_zero_area(block_map, @@ -266,13 +268,16 @@ int mlx5_cmd_dealloc_sw_icm(struct mlx5_dm *dm, int type, u64 length, u16 uid, phys_addr_t addr, u32 obj_id) { struct mlx5_core_dev *dev = dm->dev; - u32 num_blocks = DIV_ROUND_UP(length, MLX5_SW_ICM_BLOCK_SIZE(dev)); u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {}; u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {}; unsigned long *block_map; + u32 num_blocks; u64 start_idx; int err; + num_blocks = (length + MLX5_SW_ICM_BLOCK_SIZE(dev) - 1) >> + MLX5_LOG_SW_ICM_BLOCK_SIZE(dev); + switch (type) { case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM: start_idx = diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 2e2e65f00257..45f48cde6b9d 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -37,7 +37,7 @@ #include "mlx5_ib.h" #include "srq.h" -static void mlx5_ib_cq_comp(struct mlx5_core_cq *cq) +static void mlx5_ib_cq_comp(struct mlx5_core_cq *cq, struct mlx5_eqe *eqe) { struct ib_cq *ibcq = &to_mibcq(cq)->ibcq; @@ -522,9 +522,9 @@ repoll: case MLX5_CQE_SIG_ERR: sig_err_cqe = (struct mlx5_sig_err_cqe *)cqe64; - read_lock(&dev->mdev->priv.mkey_table.lock); - mmkey = __mlx5_mr_lookup(dev->mdev, - mlx5_base_mkey(be32_to_cpu(sig_err_cqe->mkey))); + xa_lock(&dev->mdev->priv.mkey_table); + mmkey = xa_load(&dev->mdev->priv.mkey_table, + mlx5_base_mkey(be32_to_cpu(sig_err_cqe->mkey))); mr = to_mibmr(mmkey); get_sig_err_item(sig_err_cqe, &mr->sig->err_item); mr->sig->sig_err_exists = true; @@ -537,7 +537,7 @@ repoll: mr->sig->err_item.expected, mr->sig->err_item.actual); - read_unlock(&dev->mdev->priv.mkey_table.lock); + xa_unlock(&dev->mdev->priv.mkey_table); goto repoll; } @@ -884,14 +884,15 @@ static void notify_soft_wc_handler(struct work_struct *work) cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); } -struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata) +int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) { + struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; int vector = attr->comp_vector; struct mlx5_ib_dev *dev = to_mdev(ibdev); - struct mlx5_ib_cq *cq; + struct mlx5_ib_cq *cq = to_mcq(ibcq); + u32 out[MLX5_ST_SZ_DW(create_cq_out)]; int uninitialized_var(index); int uninitialized_var(inlen); u32 *cqb = NULL; @@ -903,18 +904,14 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, if (entries < 0 || (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)))) - return ERR_PTR(-EINVAL); + return -EINVAL; if (check_cq_create_flags(attr->flags)) - return ERR_PTR(-EOPNOTSUPP); + return -EOPNOTSUPP; entries = roundup_pow_of_two(entries + 1); if (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz))) - return ERR_PTR(-EINVAL); - - cq = kzalloc(sizeof(*cq), GFP_KERNEL); - if (!cq) - return ERR_PTR(-ENOMEM); + return -EINVAL; cq->ibcq.cqe = entries - 1; mutex_init(&cq->resize_mutex); @@ -929,13 +926,13 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, err = create_cq_user(dev, udata, cq, entries, &cqb, &cqe_size, &index, &inlen); if (err) - goto err_create; + return err; } else { cqe_size = cache_line_size() == 128 ? 128 : 64; err = create_cq_kernel(dev, cq, entries, cqe_size, &cqb, &index, &inlen); if (err) - goto err_create; + return err; INIT_WORK(&cq->notify_work, notify_soft_wc_handler); } @@ -958,7 +955,7 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, if (cq->create_flags & IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN) MLX5_SET(cqc, cqc, oi, 1); - err = mlx5_core_create_cq(dev->mdev, &cq->mcq, cqb, inlen); + err = mlx5_core_create_cq(dev->mdev, &cq->mcq, cqb, inlen, out, sizeof(out)); if (err) goto err_cqb; @@ -980,7 +977,7 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, kvfree(cqb); - return &cq->ibcq; + return 0; err_cmd: mlx5_core_destroy_cq(dev->mdev, &cq->mcq); @@ -991,14 +988,10 @@ err_cqb: destroy_cq_user(cq, udata); else destroy_cq_kernel(dev, cq); - -err_create: - kfree(cq); - - return ERR_PTR(err); + return err; } -int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) +void mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(cq->device); struct mlx5_ib_cq *mcq = to_mcq(cq); @@ -1008,10 +1001,6 @@ int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) destroy_cq_user(mcq, udata); else destroy_cq_kernel(dev, mcq); - - kfree(mcq); - - return 0; } static int is_equal_rsn(struct mlx5_cqe64 *cqe64, u32 rsn) @@ -1137,11 +1126,6 @@ static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, return 0; } -static void un_resize_user(struct mlx5_ib_cq *cq) -{ - ib_umem_release(cq->resize_umem); -} - static int resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, int entries, int cqe_size) { @@ -1164,12 +1148,6 @@ ex: return err; } -static void un_resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq) -{ - free_cq_buf(dev, cq->resize_buf); - cq->resize_buf = NULL; -} - static int copy_resize_cqes(struct mlx5_ib_cq *cq) { struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); @@ -1350,10 +1328,11 @@ ex_alloc: kvfree(in); ex_resize: - if (udata) - un_resize_user(cq); - else - un_resize_kernel(dev, cq); + ib_umem_release(cq->resize_umem); + if (!udata) { + free_cq_buf(dev, cq->resize_buf); + cq->resize_buf = NULL; + } ex: mutex_unlock(&cq->resize_mutex); return err; diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 80b42d069328..ec4370f99381 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -14,13 +14,17 @@ #include <linux/mlx5/driver.h> #include <linux/mlx5/fs.h> #include "mlx5_ib.h" +#include <linux/xarray.h> #define UVERBS_MODULE_NAME mlx5_ib #include <rdma/uverbs_named_ioctl.h> +static void dispatch_event_fd(struct list_head *fd_list, const void *data); + enum devx_obj_flags { DEVX_OBJ_FLAGS_INDIRECT_MKEY = 1 << 0, DEVX_OBJ_FLAGS_DCT = 1 << 1, + DEVX_OBJ_FLAGS_CQ = 1 << 2, }; struct devx_async_data { @@ -33,9 +37,61 @@ struct devx_async_data { struct mlx5_ib_uapi_devx_async_cmd_hdr hdr; }; +struct devx_async_event_data { + struct list_head list; /* headed in ev_file->event_list */ + struct mlx5_ib_uapi_devx_async_event_hdr hdr; +}; + +/* first level XA value data structure */ +struct devx_event { + struct xarray object_ids; /* second XA level, Key = object id */ + struct list_head unaffiliated_list; +}; + +/* second level XA value data structure */ +struct devx_obj_event { + struct rcu_head rcu; + struct list_head obj_sub_list; +}; + +struct devx_event_subscription { + struct list_head file_list; /* headed in ev_file-> + * subscribed_events_list + */ + struct list_head xa_list; /* headed in devx_event->unaffiliated_list or + * devx_obj_event->obj_sub_list + */ + struct list_head obj_list; /* headed in devx_object */ + struct list_head event_list; /* headed in ev_file->event_list or in + * temp list via subscription + */ + + u8 is_cleaned:1; + u32 xa_key_level1; + u32 xa_key_level2; + struct rcu_head rcu; + u64 cookie; + struct devx_async_event_file *ev_file; + struct file *filp; /* Upon hot unplug we need a direct access to */ + struct eventfd_ctx *eventfd; +}; + +struct devx_async_event_file { + struct ib_uobject uobj; + /* Head of events that are subscribed to this FD */ + struct list_head subscribed_events_list; + spinlock_t lock; + wait_queue_head_t poll_wait; + struct list_head event_list; + struct mlx5_ib_dev *dev; + u8 omit_data:1; + u8 is_overflow_err:1; + u8 is_destroyed:1; +}; + #define MLX5_MAX_DESTROY_INBOX_SIZE_DW MLX5_ST_SZ_DW(delete_fte_in) struct devx_obj { - struct mlx5_core_dev *mdev; + struct mlx5_ib_dev *ib_dev; u64 obj_id; u32 dinlen; /* destroy inbox length */ u32 dinbox[MLX5_MAX_DESTROY_INBOX_SIZE_DW]; @@ -43,7 +99,9 @@ struct devx_obj { union { struct mlx5_ib_devx_mr devx_mr; struct mlx5_core_dct core_dct; + struct mlx5_core_cq core_cq; }; + struct list_head event_sub; /* holds devx_event_subscription entries */ }; struct devx_umem { @@ -149,6 +207,127 @@ bool mlx5_ib_devx_is_flow_counter(void *obj, u32 *counter_id) return false; } +static bool is_legacy_unaffiliated_event_num(u16 event_num) +{ + switch (event_num) { + case MLX5_EVENT_TYPE_PORT_CHANGE: + return true; + default: + return false; + } +} + +static bool is_legacy_obj_event_num(u16 event_num) +{ + switch (event_num) { + case MLX5_EVENT_TYPE_PATH_MIG: + case MLX5_EVENT_TYPE_COMM_EST: + case MLX5_EVENT_TYPE_SQ_DRAINED: + case MLX5_EVENT_TYPE_SRQ_LAST_WQE: + case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT: + case MLX5_EVENT_TYPE_CQ_ERROR: + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + case MLX5_EVENT_TYPE_PATH_MIG_FAILED: + case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR: + case MLX5_EVENT_TYPE_DCT_DRAINED: + case MLX5_EVENT_TYPE_COMP: + return true; + default: + return false; + } +} + +static u16 get_legacy_obj_type(u16 opcode) +{ + switch (opcode) { + case MLX5_CMD_OP_CREATE_RQ: + return MLX5_EVENT_QUEUE_TYPE_RQ; + case MLX5_CMD_OP_CREATE_QP: + return MLX5_EVENT_QUEUE_TYPE_QP; + case MLX5_CMD_OP_CREATE_SQ: + return MLX5_EVENT_QUEUE_TYPE_SQ; + case MLX5_CMD_OP_CREATE_DCT: + return MLX5_EVENT_QUEUE_TYPE_DCT; + default: + return 0; + } +} + +static u16 get_dec_obj_type(struct devx_obj *obj, u16 event_num) +{ + u16 opcode; + + opcode = (obj->obj_id >> 32) & 0xffff; + + if (is_legacy_obj_event_num(event_num)) + return get_legacy_obj_type(opcode); + + switch (opcode) { + case MLX5_CMD_OP_CREATE_GENERAL_OBJECT: + return (obj->obj_id >> 48); + case MLX5_CMD_OP_CREATE_RQ: + return MLX5_OBJ_TYPE_RQ; + case MLX5_CMD_OP_CREATE_QP: + return MLX5_OBJ_TYPE_QP; + case MLX5_CMD_OP_CREATE_SQ: + return MLX5_OBJ_TYPE_SQ; + case MLX5_CMD_OP_CREATE_DCT: + return MLX5_OBJ_TYPE_DCT; + case MLX5_CMD_OP_CREATE_TIR: + return MLX5_OBJ_TYPE_TIR; + case MLX5_CMD_OP_CREATE_TIS: + return MLX5_OBJ_TYPE_TIS; + case MLX5_CMD_OP_CREATE_PSV: + return MLX5_OBJ_TYPE_PSV; + case MLX5_OBJ_TYPE_MKEY: + return MLX5_OBJ_TYPE_MKEY; + case MLX5_CMD_OP_CREATE_RMP: + return MLX5_OBJ_TYPE_RMP; + case MLX5_CMD_OP_CREATE_XRC_SRQ: + return MLX5_OBJ_TYPE_XRC_SRQ; + case MLX5_CMD_OP_CREATE_XRQ: + return MLX5_OBJ_TYPE_XRQ; + case MLX5_CMD_OP_CREATE_RQT: + return MLX5_OBJ_TYPE_RQT; + case MLX5_CMD_OP_ALLOC_FLOW_COUNTER: + return MLX5_OBJ_TYPE_FLOW_COUNTER; + case MLX5_CMD_OP_CREATE_CQ: + return MLX5_OBJ_TYPE_CQ; + default: + return 0; + } +} + +static u16 get_event_obj_type(unsigned long event_type, struct mlx5_eqe *eqe) +{ + switch (event_type) { + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + case MLX5_EVENT_TYPE_SRQ_LAST_WQE: + case MLX5_EVENT_TYPE_PATH_MIG: + case MLX5_EVENT_TYPE_PATH_MIG_FAILED: + case MLX5_EVENT_TYPE_COMM_EST: + case MLX5_EVENT_TYPE_SQ_DRAINED: + case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT: + case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR: + return eqe->data.qp_srq.type; + case MLX5_EVENT_TYPE_CQ_ERROR: + return 0; + case MLX5_EVENT_TYPE_DCT_DRAINED: + return MLX5_EVENT_QUEUE_TYPE_DCT; + default: + return MLX5_GET(affiliated_event_header, &eqe->data, obj_type); + } +} + +static u32 get_dec_obj_id(u64 obj_id) +{ + return (obj_id & 0xffffffff); +} + /* * As the obj_id in the firmware is not globally unique the object type * must be considered upon checking for a valid object id. @@ -715,12 +894,16 @@ static int devx_get_uid(struct mlx5_ib_ucontext *c, void *cmd_in) return c->devx_uid; } -static bool devx_is_general_cmd(void *in) + +static bool devx_is_general_cmd(void *in, struct mlx5_ib_dev *dev) { u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode); - if (opcode >= MLX5_CMD_OP_GENERAL_START && - opcode < MLX5_CMD_OP_GENERAL_END) + /* Pass all cmds for vhca_tunnel as general, tracking is done in FW */ + if ((MLX5_CAP_GEN_64(dev->mdev, vhca_tunnel_commands) && + MLX5_GET(general_obj_in_cmd_hdr, in, vhca_tunnel_id)) || + (opcode >= MLX5_CMD_OP_GENERAL_START && + opcode < MLX5_CMD_OP_GENERAL_END)) return true; switch (opcode) { @@ -846,7 +1029,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OTHER)( return uid; /* Only white list of some general HCA commands are allowed for this method. */ - if (!devx_is_general_cmd(cmd_in)) + if (!devx_is_general_cmd(cmd_in, dev)) return -EINVAL; cmd_out = uverbs_zalloc(attrs, cmd_out_len); @@ -1043,13 +1226,10 @@ static int devx_handle_mkey_indirect(struct devx_obj *obj, struct mlx5_ib_dev *dev, void *in, void *out) { - struct mlx5_mkey_table *table = &dev->mdev->priv.mkey_table; struct mlx5_ib_devx_mr *devx_mr = &obj->devx_mr; - unsigned long flags; struct mlx5_core_mkey *mkey; void *mkc; u8 key; - int err; mkey = &devx_mr->mmkey; mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); @@ -1062,11 +1242,8 @@ static int devx_handle_mkey_indirect(struct devx_obj *obj, mkey->pd = MLX5_GET(mkc, mkc, pd); devx_mr->ndescs = MLX5_GET(mkc, mkc, translations_octword_size); - write_lock_irqsave(&table->lock, flags); - err = radix_tree_insert(&table->tree, mlx5_base_mkey(mkey->key), - mkey); - write_unlock_irqrestore(&table->lock, flags); - return err; + return xa_err(xa_store(&dev->mdev->priv.mkey_table, + mlx5_base_mkey(mkey->key), mkey, GFP_KERNEL)); } static int devx_handle_mkey_create(struct mlx5_ib_dev *dev, @@ -1117,12 +1294,37 @@ static void devx_free_indirect_mkey(struct rcu_head *rcu) */ static void devx_cleanup_mkey(struct devx_obj *obj) { - struct mlx5_mkey_table *table = &obj->mdev->priv.mkey_table; - unsigned long flags; + xa_erase(&obj->ib_dev->mdev->priv.mkey_table, + mlx5_base_mkey(obj->devx_mr.mmkey.key)); +} - write_lock_irqsave(&table->lock, flags); - radix_tree_delete(&table->tree, mlx5_base_mkey(obj->devx_mr.mmkey.key)); - write_unlock_irqrestore(&table->lock, flags); +static void devx_cleanup_subscription(struct mlx5_ib_dev *dev, + struct devx_event_subscription *sub) +{ + struct devx_event *event; + struct devx_obj_event *xa_val_level2; + + if (sub->is_cleaned) + return; + + sub->is_cleaned = 1; + list_del_rcu(&sub->xa_list); + + if (list_empty(&sub->obj_list)) + return; + + list_del_rcu(&sub->obj_list); + /* check whether key level 1 for this obj_sub_list is empty */ + event = xa_load(&dev->devx_event_table.event_xa, + sub->xa_key_level1); + WARN_ON(!event); + + xa_val_level2 = xa_load(&event->object_ids, sub->xa_key_level2); + if (list_empty(&xa_val_level2->obj_sub_list)) { + xa_erase(&event->object_ids, + sub->xa_key_level2); + kfree_rcu(xa_val_level2, rcu); + } } static int devx_obj_cleanup(struct ib_uobject *uobject, @@ -1130,24 +1332,34 @@ static int devx_obj_cleanup(struct ib_uobject *uobject, struct uverbs_attr_bundle *attrs) { u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; + struct mlx5_devx_event_table *devx_event_table; struct devx_obj *obj = uobject->object; + struct devx_event_subscription *sub_entry, *tmp; + struct mlx5_ib_dev *dev; int ret; + dev = mlx5_udata_to_mdev(&attrs->driver_udata); if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) devx_cleanup_mkey(obj); if (obj->flags & DEVX_OBJ_FLAGS_DCT) - ret = mlx5_core_destroy_dct(obj->mdev, &obj->core_dct); + ret = mlx5_core_destroy_dct(obj->ib_dev->mdev, &obj->core_dct); + else if (obj->flags & DEVX_OBJ_FLAGS_CQ) + ret = mlx5_core_destroy_cq(obj->ib_dev->mdev, &obj->core_cq); else - ret = mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, - sizeof(out)); + ret = mlx5_cmd_exec(obj->ib_dev->mdev, obj->dinbox, + obj->dinlen, out, sizeof(out)); if (ib_is_destroy_retryable(ret, why, uobject)) return ret; - if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) { - struct mlx5_ib_dev *dev = - mlx5_udata_to_mdev(&attrs->driver_udata); + devx_event_table = &dev->devx_event_table; + + mutex_lock(&devx_event_table->event_xa_lock); + list_for_each_entry_safe(sub_entry, tmp, &obj->event_sub, obj_list) + devx_cleanup_subscription(dev, sub_entry); + mutex_unlock(&devx_event_table->event_xa_lock); + if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) { call_srcu(&dev->mr_srcu, &obj->devx_mr.rcu, devx_free_indirect_mkey); return ret; @@ -1157,6 +1369,29 @@ static int devx_obj_cleanup(struct ib_uobject *uobject, return ret; } +static void devx_cq_comp(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe) +{ + struct devx_obj *obj = container_of(mcq, struct devx_obj, core_cq); + struct mlx5_devx_event_table *table; + struct devx_event *event; + struct devx_obj_event *obj_event; + u32 obj_id = mcq->cqn; + + table = &obj->ib_dev->devx_event_table; + rcu_read_lock(); + event = xa_load(&table->event_xa, MLX5_EVENT_TYPE_COMP); + if (!event) + goto out; + + obj_event = xa_load(&event->object_ids, obj_id); + if (!obj_event) + goto out; + + dispatch_event_fd(&obj_event->obj_sub_list, eqe); +out: + rcu_read_unlock(); +} + static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( struct uverbs_attr_bundle *attrs) { @@ -1179,6 +1414,9 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( u32 obj_id; u16 opcode; + if (MLX5_GET(general_obj_in_cmd_hdr, cmd_in, vhca_tunnel_id)) + return -EINVAL; + uid = devx_get_uid(c, cmd_in); if (uid < 0) return uid; @@ -1208,6 +1446,12 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( err = mlx5_core_create_dct(dev->mdev, &obj->core_dct, cmd_in, cmd_in_len, cmd_out, cmd_out_len); + } else if (opcode == MLX5_CMD_OP_CREATE_CQ) { + obj->flags |= DEVX_OBJ_FLAGS_CQ; + obj->core_cq.comp = devx_cq_comp; + err = mlx5_core_create_cq(dev->mdev, &obj->core_cq, + cmd_in, cmd_in_len, cmd_out, + cmd_out_len); } else { err = mlx5_cmd_exec(dev->mdev, cmd_in, cmd_in_len, @@ -1218,7 +1462,8 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( goto obj_free; uobj->object = obj; - obj->mdev = dev->mdev; + INIT_LIST_HEAD(&obj->event_sub); + obj->ib_dev = dev; devx_obj_build_destroy_cmd(cmd_in, cmd_out, obj->dinbox, &obj->dinlen, &obj_id); WARN_ON(obj->dinlen > MLX5_MAX_DESTROY_INBOX_SIZE_DW * sizeof(u32)); @@ -1245,9 +1490,11 @@ err_copy: devx_cleanup_mkey(obj); obj_destroy: if (obj->flags & DEVX_OBJ_FLAGS_DCT) - mlx5_core_destroy_dct(obj->mdev, &obj->core_dct); + mlx5_core_destroy_dct(obj->ib_dev->mdev, &obj->core_dct); + else if (obj->flags & DEVX_OBJ_FLAGS_CQ) + mlx5_core_destroy_cq(obj->ib_dev->mdev, &obj->core_cq); else - mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, + mlx5_cmd_exec(obj->ib_dev->mdev, obj->dinbox, obj->dinlen, out, sizeof(out)); obj_free: kfree(obj); @@ -1269,6 +1516,9 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_MODIFY)( int err; int uid; + if (MLX5_GET(general_obj_in_cmd_hdr, cmd_in, vhca_tunnel_id)) + return -EINVAL; + uid = devx_get_uid(c, cmd_in); if (uid < 0) return uid; @@ -1311,6 +1561,9 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_QUERY)( int uid; struct mlx5_ib_dev *mdev = to_mdev(c->ibucontext.device); + if (MLX5_GET(general_obj_in_cmd_hdr, cmd_in, vhca_tunnel_id)) + return -EINVAL; + uid = devx_get_uid(c, cmd_in); if (uid < 0) return uid; @@ -1375,6 +1628,38 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC)( return 0; } +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_ASYNC_EVENT_FD_ALLOC)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_HANDLE); + struct devx_async_event_file *ev_file; + struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context( + &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext); + struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device); + u32 flags; + int err; + + err = uverbs_get_flags32(&flags, attrs, + MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_FLAGS, + MLX5_IB_UAPI_DEVX_CR_EV_CH_FLAGS_OMIT_DATA); + + if (err) + return err; + + ev_file = container_of(uobj, struct devx_async_event_file, + uobj); + spin_lock_init(&ev_file->lock); + INIT_LIST_HEAD(&ev_file->event_list); + init_waitqueue_head(&ev_file->poll_wait); + if (flags & MLX5_IB_UAPI_DEVX_CR_EV_CH_FLAGS_OMIT_DATA) + ev_file->omit_data = 1; + INIT_LIST_HEAD(&ev_file->subscribed_events_list); + ev_file->dev = dev; + get_device(&dev->ib_dev.dev); + return 0; +} + static void devx_query_callback(int status, struct mlx5_async_work *context) { struct devx_async_data *async_data = @@ -1416,6 +1701,9 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY)( struct devx_async_cmd_event_file *ev_file; struct devx_async_data *async_data; + if (MLX5_GET(general_obj_in_cmd_hdr, cmd_in, vhca_tunnel_id)) + return -EINVAL; + uid = devx_get_uid(c, cmd_in); if (uid < 0) return uid; @@ -1484,6 +1772,331 @@ sub_bytes: return err; } +static void +subscribe_event_xa_dealloc(struct mlx5_devx_event_table *devx_event_table, + u32 key_level1, + bool is_level2, + u32 key_level2) +{ + struct devx_event *event; + struct devx_obj_event *xa_val_level2; + + /* Level 1 is valid for future use, no need to free */ + if (!is_level2) + return; + + event = xa_load(&devx_event_table->event_xa, key_level1); + WARN_ON(!event); + + xa_val_level2 = xa_load(&event->object_ids, + key_level2); + if (list_empty(&xa_val_level2->obj_sub_list)) { + xa_erase(&event->object_ids, + key_level2); + kfree_rcu(xa_val_level2, rcu); + } +} + +static int +subscribe_event_xa_alloc(struct mlx5_devx_event_table *devx_event_table, + u32 key_level1, + bool is_level2, + u32 key_level2) +{ + struct devx_obj_event *obj_event; + struct devx_event *event; + int err; + + event = xa_load(&devx_event_table->event_xa, key_level1); + if (!event) { + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return -ENOMEM; + + INIT_LIST_HEAD(&event->unaffiliated_list); + xa_init(&event->object_ids); + + err = xa_insert(&devx_event_table->event_xa, + key_level1, + event, + GFP_KERNEL); + if (err) { + kfree(event); + return err; + } + } + + if (!is_level2) + return 0; + + obj_event = xa_load(&event->object_ids, key_level2); + if (!obj_event) { + obj_event = kzalloc(sizeof(*obj_event), GFP_KERNEL); + if (!obj_event) + /* Level1 is valid for future use, no need to free */ + return -ENOMEM; + + err = xa_insert(&event->object_ids, + key_level2, + obj_event, + GFP_KERNEL); + if (err) + return err; + INIT_LIST_HEAD(&obj_event->obj_sub_list); + } + + return 0; +} + +static bool is_valid_events_legacy(int num_events, u16 *event_type_num_list, + struct devx_obj *obj) +{ + int i; + + for (i = 0; i < num_events; i++) { + if (obj) { + if (!is_legacy_obj_event_num(event_type_num_list[i])) + return false; + } else if (!is_legacy_unaffiliated_event_num( + event_type_num_list[i])) { + return false; + } + } + + return true; +} + +#define MAX_SUPP_EVENT_NUM 255 +static bool is_valid_events(struct mlx5_core_dev *dev, + int num_events, u16 *event_type_num_list, + struct devx_obj *obj) +{ + __be64 *aff_events; + __be64 *unaff_events; + int mask_entry; + int mask_bit; + int i; + + if (MLX5_CAP_GEN(dev, event_cap)) { + aff_events = MLX5_CAP_DEV_EVENT(dev, + user_affiliated_events); + unaff_events = MLX5_CAP_DEV_EVENT(dev, + user_unaffiliated_events); + } else { + return is_valid_events_legacy(num_events, event_type_num_list, + obj); + } + + for (i = 0; i < num_events; i++) { + if (event_type_num_list[i] > MAX_SUPP_EVENT_NUM) + return false; + + mask_entry = event_type_num_list[i] / 64; + mask_bit = event_type_num_list[i] % 64; + + if (obj) { + /* CQ completion */ + if (event_type_num_list[i] == 0) + continue; + + if (!(be64_to_cpu(aff_events[mask_entry]) & + (1ull << mask_bit))) + return false; + + continue; + } + + if (!(be64_to_cpu(unaff_events[mask_entry]) & + (1ull << mask_bit))) + return false; + } + + return true; +} + +#define MAX_NUM_EVENTS 16 +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *devx_uobj = uverbs_attr_get_uobject( + attrs, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_OBJ_HANDLE); + struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context( + &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext); + struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device); + struct ib_uobject *fd_uobj; + struct devx_obj *obj = NULL; + struct devx_async_event_file *ev_file; + struct mlx5_devx_event_table *devx_event_table = &dev->devx_event_table; + u16 *event_type_num_list; + struct devx_event_subscription *event_sub, *tmp_sub; + struct list_head sub_list; + int redirect_fd; + bool use_eventfd = false; + int num_events; + int num_alloc_xa_entries = 0; + u16 obj_type = 0; + u64 cookie = 0; + u32 obj_id = 0; + int err; + int i; + + if (!c->devx_uid) + return -EINVAL; + + if (!IS_ERR(devx_uobj)) { + obj = (struct devx_obj *)devx_uobj->object; + if (obj) + obj_id = get_dec_obj_id(obj->obj_id); + } + + fd_uobj = uverbs_attr_get_uobject(attrs, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_HANDLE); + if (IS_ERR(fd_uobj)) + return PTR_ERR(fd_uobj); + + ev_file = container_of(fd_uobj, struct devx_async_event_file, + uobj); + + if (uverbs_attr_is_valid(attrs, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_NUM)) { + err = uverbs_copy_from(&redirect_fd, attrs, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_NUM); + if (err) + return err; + + use_eventfd = true; + } + + if (uverbs_attr_is_valid(attrs, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_COOKIE)) { + if (use_eventfd) + return -EINVAL; + + err = uverbs_copy_from(&cookie, attrs, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_COOKIE); + if (err) + return err; + } + + num_events = uverbs_attr_ptr_get_array_size( + attrs, MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_TYPE_NUM_LIST, + sizeof(u16)); + + if (num_events < 0) + return num_events; + + if (num_events > MAX_NUM_EVENTS) + return -EINVAL; + + event_type_num_list = uverbs_attr_get_alloced_ptr(attrs, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_TYPE_NUM_LIST); + + if (!is_valid_events(dev->mdev, num_events, event_type_num_list, obj)) + return -EINVAL; + + INIT_LIST_HEAD(&sub_list); + + /* Protect from concurrent subscriptions to same XA entries to allow + * both to succeed + */ + mutex_lock(&devx_event_table->event_xa_lock); + for (i = 0; i < num_events; i++) { + u32 key_level1; + + if (obj) + obj_type = get_dec_obj_type(obj, + event_type_num_list[i]); + key_level1 = event_type_num_list[i] | obj_type << 16; + + err = subscribe_event_xa_alloc(devx_event_table, + key_level1, + obj, + obj_id); + if (err) + goto err; + + num_alloc_xa_entries++; + event_sub = kzalloc(sizeof(*event_sub), GFP_KERNEL); + if (!event_sub) + goto err; + + list_add_tail(&event_sub->event_list, &sub_list); + if (use_eventfd) { + event_sub->eventfd = + eventfd_ctx_fdget(redirect_fd); + + if (IS_ERR(event_sub)) { + err = PTR_ERR(event_sub->eventfd); + event_sub->eventfd = NULL; + goto err; + } + } + + event_sub->cookie = cookie; + event_sub->ev_file = ev_file; + event_sub->filp = fd_uobj->object; + /* May be needed upon cleanup the devx object/subscription */ + event_sub->xa_key_level1 = key_level1; + event_sub->xa_key_level2 = obj_id; + INIT_LIST_HEAD(&event_sub->obj_list); + } + + /* Once all the allocations and the XA data insertions were done we + * can go ahead and add all the subscriptions to the relevant lists + * without concern of a failure. + */ + list_for_each_entry_safe(event_sub, tmp_sub, &sub_list, event_list) { + struct devx_event *event; + struct devx_obj_event *obj_event; + + list_del_init(&event_sub->event_list); + + spin_lock_irq(&ev_file->lock); + list_add_tail_rcu(&event_sub->file_list, + &ev_file->subscribed_events_list); + spin_unlock_irq(&ev_file->lock); + + event = xa_load(&devx_event_table->event_xa, + event_sub->xa_key_level1); + WARN_ON(!event); + + if (!obj) { + list_add_tail_rcu(&event_sub->xa_list, + &event->unaffiliated_list); + continue; + } + + obj_event = xa_load(&event->object_ids, obj_id); + WARN_ON(!obj_event); + list_add_tail_rcu(&event_sub->xa_list, + &obj_event->obj_sub_list); + list_add_tail_rcu(&event_sub->obj_list, + &obj->event_sub); + } + + mutex_unlock(&devx_event_table->event_xa_lock); + return 0; + +err: + list_for_each_entry_safe(event_sub, tmp_sub, &sub_list, event_list) { + list_del(&event_sub->event_list); + + subscribe_event_xa_dealloc(devx_event_table, + event_sub->xa_key_level1, + obj, + obj_id); + + if (event_sub->eventfd) + eventfd_ctx_put(event_sub->eventfd); + + kfree(event_sub); + } + + mutex_unlock(&devx_event_table->event_xa_lock); + return err; +} + static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext, struct uverbs_attr_bundle *attrs, struct devx_umem *obj) @@ -1631,6 +2244,203 @@ static int devx_umem_cleanup(struct ib_uobject *uobject, return 0; } +static bool is_unaffiliated_event(struct mlx5_core_dev *dev, + unsigned long event_type) +{ + __be64 *unaff_events; + int mask_entry; + int mask_bit; + + if (!MLX5_CAP_GEN(dev, event_cap)) + return is_legacy_unaffiliated_event_num(event_type); + + unaff_events = MLX5_CAP_DEV_EVENT(dev, + user_unaffiliated_events); + WARN_ON(event_type > MAX_SUPP_EVENT_NUM); + + mask_entry = event_type / 64; + mask_bit = event_type % 64; + + if (!(be64_to_cpu(unaff_events[mask_entry]) & (1ull << mask_bit))) + return false; + + return true; +} + +static u32 devx_get_obj_id_from_event(unsigned long event_type, void *data) +{ + struct mlx5_eqe *eqe = data; + u32 obj_id = 0; + + switch (event_type) { + case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR: + case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT: + case MLX5_EVENT_TYPE_PATH_MIG: + case MLX5_EVENT_TYPE_COMM_EST: + case MLX5_EVENT_TYPE_SQ_DRAINED: + case MLX5_EVENT_TYPE_SRQ_LAST_WQE: + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + case MLX5_EVENT_TYPE_PATH_MIG_FAILED: + case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + obj_id = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff; + break; + case MLX5_EVENT_TYPE_DCT_DRAINED: + obj_id = be32_to_cpu(eqe->data.dct.dctn) & 0xffffff; + break; + case MLX5_EVENT_TYPE_CQ_ERROR: + obj_id = be32_to_cpu(eqe->data.cq_err.cqn) & 0xffffff; + break; + default: + obj_id = MLX5_GET(affiliated_event_header, &eqe->data, obj_id); + break; + } + + return obj_id; +} + +static int deliver_event(struct devx_event_subscription *event_sub, + const void *data) +{ + struct devx_async_event_file *ev_file; + struct devx_async_event_data *event_data; + unsigned long flags; + + ev_file = event_sub->ev_file; + + if (ev_file->omit_data) { + spin_lock_irqsave(&ev_file->lock, flags); + if (!list_empty(&event_sub->event_list)) { + spin_unlock_irqrestore(&ev_file->lock, flags); + return 0; + } + + list_add_tail(&event_sub->event_list, &ev_file->event_list); + spin_unlock_irqrestore(&ev_file->lock, flags); + wake_up_interruptible(&ev_file->poll_wait); + return 0; + } + + event_data = kzalloc(sizeof(*event_data) + sizeof(struct mlx5_eqe), + GFP_ATOMIC); + if (!event_data) { + spin_lock_irqsave(&ev_file->lock, flags); + ev_file->is_overflow_err = 1; + spin_unlock_irqrestore(&ev_file->lock, flags); + return -ENOMEM; + } + + event_data->hdr.cookie = event_sub->cookie; + memcpy(event_data->hdr.out_data, data, sizeof(struct mlx5_eqe)); + + spin_lock_irqsave(&ev_file->lock, flags); + list_add_tail(&event_data->list, &ev_file->event_list); + spin_unlock_irqrestore(&ev_file->lock, flags); + wake_up_interruptible(&ev_file->poll_wait); + + return 0; +} + +static void dispatch_event_fd(struct list_head *fd_list, + const void *data) +{ + struct devx_event_subscription *item; + + list_for_each_entry_rcu(item, fd_list, xa_list) { + if (!get_file_rcu(item->filp)) + continue; + + if (item->eventfd) { + eventfd_signal(item->eventfd, 1); + fput(item->filp); + continue; + } + + deliver_event(item, data); + fput(item->filp); + } +} + +static int devx_event_notifier(struct notifier_block *nb, + unsigned long event_type, void *data) +{ + struct mlx5_devx_event_table *table; + struct mlx5_ib_dev *dev; + struct devx_event *event; + struct devx_obj_event *obj_event; + u16 obj_type = 0; + bool is_unaffiliated; + u32 obj_id; + + /* Explicit filtering to kernel events which may occur frequently */ + if (event_type == MLX5_EVENT_TYPE_CMD || + event_type == MLX5_EVENT_TYPE_PAGE_REQUEST) + return NOTIFY_OK; + + table = container_of(nb, struct mlx5_devx_event_table, devx_nb.nb); + dev = container_of(table, struct mlx5_ib_dev, devx_event_table); + is_unaffiliated = is_unaffiliated_event(dev->mdev, event_type); + + if (!is_unaffiliated) + obj_type = get_event_obj_type(event_type, data); + + rcu_read_lock(); + event = xa_load(&table->event_xa, event_type | (obj_type << 16)); + if (!event) { + rcu_read_unlock(); + return NOTIFY_DONE; + } + + if (is_unaffiliated) { + dispatch_event_fd(&event->unaffiliated_list, data); + rcu_read_unlock(); + return NOTIFY_OK; + } + + obj_id = devx_get_obj_id_from_event(event_type, data); + obj_event = xa_load(&event->object_ids, obj_id); + if (!obj_event) { + rcu_read_unlock(); + return NOTIFY_DONE; + } + + dispatch_event_fd(&obj_event->obj_sub_list, data); + + rcu_read_unlock(); + return NOTIFY_OK; +} + +void mlx5_ib_devx_init_event_table(struct mlx5_ib_dev *dev) +{ + struct mlx5_devx_event_table *table = &dev->devx_event_table; + + xa_init(&table->event_xa); + mutex_init(&table->event_xa_lock); + MLX5_NB_INIT(&table->devx_nb, devx_event_notifier, NOTIFY_ANY); + mlx5_eq_notifier_register(dev->mdev, &table->devx_nb); +} + +void mlx5_ib_devx_cleanup_event_table(struct mlx5_ib_dev *dev) +{ + struct mlx5_devx_event_table *table = &dev->devx_event_table; + struct devx_event_subscription *sub, *tmp; + struct devx_event *event; + void *entry; + unsigned long id; + + mlx5_eq_notifier_unregister(dev->mdev, &table->devx_nb); + mutex_lock(&dev->devx_event_table.event_xa_lock); + xa_for_each(&table->event_xa, id, entry) { + event = entry; + list_for_each_entry_safe(sub, tmp, &event->unaffiliated_list, + xa_list) + devx_cleanup_subscription(dev, sub); + kfree(entry); + } + mutex_unlock(&dev->devx_event_table.event_xa_lock); + xa_destroy(&table->event_xa); +} + static ssize_t devx_async_cmd_event_read(struct file *filp, char __user *buf, size_t count, loff_t *pos) { @@ -1729,6 +2539,149 @@ static const struct file_operations devx_async_cmd_event_fops = { .llseek = no_llseek, }; +static ssize_t devx_async_event_read(struct file *filp, char __user *buf, + size_t count, loff_t *pos) +{ + struct devx_async_event_file *ev_file = filp->private_data; + struct devx_event_subscription *event_sub; + struct devx_async_event_data *uninitialized_var(event); + int ret = 0; + size_t eventsz; + bool omit_data; + void *event_data; + + omit_data = ev_file->omit_data; + + spin_lock_irq(&ev_file->lock); + + if (ev_file->is_overflow_err) { + ev_file->is_overflow_err = 0; + spin_unlock_irq(&ev_file->lock); + return -EOVERFLOW; + } + + if (ev_file->is_destroyed) { + spin_unlock_irq(&ev_file->lock); + return -EIO; + } + + while (list_empty(&ev_file->event_list)) { + spin_unlock_irq(&ev_file->lock); + + if (filp->f_flags & O_NONBLOCK) + return -EAGAIN; + + if (wait_event_interruptible(ev_file->poll_wait, + (!list_empty(&ev_file->event_list) || + ev_file->is_destroyed))) { + return -ERESTARTSYS; + } + + spin_lock_irq(&ev_file->lock); + if (ev_file->is_destroyed) { + spin_unlock_irq(&ev_file->lock); + return -EIO; + } + } + + if (omit_data) { + event_sub = list_first_entry(&ev_file->event_list, + struct devx_event_subscription, + event_list); + eventsz = sizeof(event_sub->cookie); + event_data = &event_sub->cookie; + } else { + event = list_first_entry(&ev_file->event_list, + struct devx_async_event_data, list); + eventsz = sizeof(struct mlx5_eqe) + + sizeof(struct mlx5_ib_uapi_devx_async_event_hdr); + event_data = &event->hdr; + } + + if (eventsz > count) { + spin_unlock_irq(&ev_file->lock); + return -EINVAL; + } + + if (omit_data) + list_del_init(&event_sub->event_list); + else + list_del(&event->list); + + spin_unlock_irq(&ev_file->lock); + + if (copy_to_user(buf, event_data, eventsz)) + /* This points to an application issue, not a kernel concern */ + ret = -EFAULT; + else + ret = eventsz; + + if (!omit_data) + kfree(event); + return ret; +} + +static __poll_t devx_async_event_poll(struct file *filp, + struct poll_table_struct *wait) +{ + struct devx_async_event_file *ev_file = filp->private_data; + __poll_t pollflags = 0; + + poll_wait(filp, &ev_file->poll_wait, wait); + + spin_lock_irq(&ev_file->lock); + if (ev_file->is_destroyed) + pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; + else if (!list_empty(&ev_file->event_list)) + pollflags = EPOLLIN | EPOLLRDNORM; + spin_unlock_irq(&ev_file->lock); + + return pollflags; +} + +static int devx_async_event_close(struct inode *inode, struct file *filp) +{ + struct devx_async_event_file *ev_file = filp->private_data; + struct devx_event_subscription *event_sub, *event_sub_tmp; + struct devx_async_event_data *entry, *tmp; + + mutex_lock(&ev_file->dev->devx_event_table.event_xa_lock); + /* delete the subscriptions which are related to this FD */ + list_for_each_entry_safe(event_sub, event_sub_tmp, + &ev_file->subscribed_events_list, file_list) { + devx_cleanup_subscription(ev_file->dev, event_sub); + if (event_sub->eventfd) + eventfd_ctx_put(event_sub->eventfd); + + list_del_rcu(&event_sub->file_list); + /* subscription may not be used by the read API any more */ + kfree_rcu(event_sub, rcu); + } + + mutex_unlock(&ev_file->dev->devx_event_table.event_xa_lock); + + /* free the pending events allocation */ + if (!ev_file->omit_data) { + spin_lock_irq(&ev_file->lock); + list_for_each_entry_safe(entry, tmp, + &ev_file->event_list, list) + kfree(entry); /* read can't come any more */ + spin_unlock_irq(&ev_file->lock); + } + + uverbs_close_fd(filp); + put_device(&ev_file->dev->ib_dev.dev); + return 0; +} + +static const struct file_operations devx_async_event_fops = { + .owner = THIS_MODULE, + .read = devx_async_event_read, + .poll = devx_async_event_poll, + .release = devx_async_event_close, + .llseek = no_llseek, +}; + static int devx_hot_unplug_async_cmd_event_file(struct ib_uobject *uobj, enum rdma_remove_reason why) { @@ -1748,6 +2701,21 @@ static int devx_hot_unplug_async_cmd_event_file(struct ib_uobject *uobj, return 0; }; +static int devx_hot_unplug_async_event_file(struct ib_uobject *uobj, + enum rdma_remove_reason why) +{ + struct devx_async_event_file *ev_file = + container_of(uobj, struct devx_async_event_file, + uobj); + + spin_lock_irq(&ev_file->lock); + ev_file->is_destroyed = 1; + spin_unlock_irq(&ev_file->lock); + + wake_up_interruptible(&ev_file->poll_wait); + return 0; +}; + DECLARE_UVERBS_NAMED_METHOD( MLX5_IB_METHOD_DEVX_UMEM_REG, UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_UMEM_REG_HANDLE, @@ -1879,10 +2847,32 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_ATTR_TYPE(u64), UA_MANDATORY)); +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT, + UVERBS_ATTR_FD(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_HANDLE, + MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_OBJ_HANDLE, + MLX5_IB_OBJECT_DEVX_OBJ, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_TYPE_NUM_LIST, + UVERBS_ATTR_MIN_SIZE(sizeof(u16)), + UA_MANDATORY, + UA_ALLOC_AND_COPY), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_COOKIE, + UVERBS_ATTR_TYPE(u64), + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_NUM, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL)); + DECLARE_UVERBS_GLOBAL_METHODS(MLX5_IB_OBJECT_DEVX, &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OTHER), &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_QUERY_UAR), - &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_QUERY_EQN)); + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_QUERY_EQN), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT)); DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_DEVX_OBJ, UVERBS_TYPE_ALLOC_IDR(devx_obj_cleanup), @@ -1913,6 +2903,24 @@ DECLARE_UVERBS_NAMED_OBJECT( O_RDONLY), &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC)); +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DEVX_ASYNC_EVENT_FD_ALLOC, + UVERBS_ATTR_FD(MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_HANDLE, + MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_FLAGS, + enum mlx5_ib_uapi_devx_create_event_channel_flags, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT( + MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD, + UVERBS_TYPE_ALLOC_FD(sizeof(struct devx_async_event_file), + devx_hot_unplug_async_event_file, + &devx_async_event_fops, "[devx_async_event]", + O_RDONLY), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_ASYNC_EVENT_FD_ALLOC)); + static bool devx_is_supported(struct ib_device *device) { struct mlx5_ib_dev *dev = to_mdev(device); @@ -1933,5 +2941,8 @@ const struct uapi_definition mlx5_ib_devx_defs[] = { UAPI_DEF_CHAIN_OBJ_TREE_NAMED( MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD, UAPI_DEF_IS_OBJ_SUPPORTED(devx_is_supported)), + UAPI_DEF_CHAIN_OBJ_TREE_NAMED( + MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD, + UAPI_DEF_IS_OBJ_SUPPORTED(devx_is_supported)), {}, }; diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c index 1fc302d41a53..b8841355fcd5 100644 --- a/drivers/infiniband/hw/mlx5/flow.c +++ b/drivers/infiniband/hw/mlx5/flow.c @@ -65,11 +65,12 @@ static const struct uverbs_attr_spec mlx5_ib_flow_type[] = { static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( struct uverbs_attr_bundle *attrs) { - struct mlx5_flow_act flow_act = {.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG}; + struct mlx5_flow_context flow_context = {.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG}; struct mlx5_ib_flow_handler *flow_handler; struct mlx5_ib_flow_matcher *fs_matcher; struct ib_uobject **arr_flow_actions; struct ib_uflow_resources *uflow_res; + struct mlx5_flow_act flow_act = {}; void *devx_obj; int dest_id, dest_type; void *cmd_in; @@ -172,17 +173,19 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( arr_flow_actions[i]->object); } - ret = uverbs_copy_from(&flow_act.flow_tag, attrs, + ret = uverbs_copy_from(&flow_context.flow_tag, attrs, MLX5_IB_ATTR_CREATE_FLOW_TAG); if (!ret) { - if (flow_act.flow_tag >= BIT(24)) { + if (flow_context.flow_tag >= BIT(24)) { ret = -EINVAL; goto err_out; } - flow_act.flags |= FLOW_ACT_HAS_TAG; + flow_context.flags |= FLOW_CONTEXT_HAS_TAG; } - flow_handler = mlx5_ib_raw_fs_rule_add(dev, fs_matcher, &flow_act, + flow_handler = mlx5_ib_raw_fs_rule_add(dev, fs_matcher, + &flow_context, + &flow_act, counter_id, cmd_in, inlen, dest_id, dest_type); diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c index 269b24a3baa1..74ce9249e75a 100644 --- a/drivers/infiniband/hw/mlx5/ib_rep.c +++ b/drivers/infiniband/hw/mlx5/ib_rep.c @@ -14,9 +14,10 @@ mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) int vport_index; ibdev = mlx5_ib_get_uplink_ibdev(dev->priv.eswitch); - vport_index = ibdev->free_port++; + vport_index = rep->vport_index; ibdev->port[vport_index].rep = rep; + rep->rep_data[REP_IB].priv = ibdev; write_lock(&ibdev->port[vport_index].roce.netdev_lock); ibdev->port[vport_index].roce.netdev = mlx5_ib_get_rep_netdev(dev->priv.eswitch, rep->vport); @@ -28,7 +29,7 @@ mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) static int mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) { - int num_ports = MLX5_TOTAL_VPORTS(dev); + int num_ports = mlx5_eswitch_get_total_vports(dev); const struct mlx5_ib_profile *profile; struct mlx5_ib_dev *ibdev; int vport_index; @@ -50,7 +51,7 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) } ibdev->is_rep = true; - vport_index = ibdev->free_port++; + vport_index = rep->vport_index; ibdev->port[vport_index].rep = rep; ibdev->port[vport_index].roce.netdev = mlx5_ib_get_rep_netdev(dev->priv.eswitch, rep->vport); @@ -60,7 +61,7 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) if (!__mlx5_ib_add(ibdev, profile)) return -EINVAL; - rep->rep_if[REP_IB].priv = ibdev; + rep->rep_data[REP_IB].priv = ibdev; return 0; } @@ -68,15 +69,18 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) static void mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep) { - struct mlx5_ib_dev *dev; + struct mlx5_ib_dev *dev = mlx5_ib_rep_to_dev(rep); + struct mlx5_ib_port *port; - if (!rep->rep_if[REP_IB].priv || - rep->vport != MLX5_VPORT_UPLINK) - return; + port = &dev->port[rep->vport_index]; + write_lock(&port->roce.netdev_lock); + port->roce.netdev = NULL; + write_unlock(&port->roce.netdev_lock); + rep->rep_data[REP_IB].priv = NULL; + port->rep = NULL; - dev = mlx5_ib_rep_to_dev(rep); - __mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX); - rep->rep_if[REP_IB].priv = NULL; + if (rep->vport == MLX5_VPORT_UPLINK) + __mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX); } static void *mlx5_ib_vport_get_proto_dev(struct mlx5_eswitch_rep *rep) @@ -84,16 +88,17 @@ static void *mlx5_ib_vport_get_proto_dev(struct mlx5_eswitch_rep *rep) return mlx5_ib_rep_to_dev(rep); } +static const struct mlx5_eswitch_rep_ops rep_ops = { + .load = mlx5_ib_vport_rep_load, + .unload = mlx5_ib_vport_rep_unload, + .get_proto_dev = mlx5_ib_vport_get_proto_dev, +}; + void mlx5_ib_register_vport_reps(struct mlx5_core_dev *mdev) { struct mlx5_eswitch *esw = mdev->priv.eswitch; - struct mlx5_eswitch_rep_if rep_if = {}; - - rep_if.load = mlx5_ib_vport_rep_load; - rep_if.unload = mlx5_ib_vport_rep_unload; - rep_if.get_proto_dev = mlx5_ib_vport_get_proto_dev; - mlx5_eswitch_register_vport_reps(esw, &rep_if, REP_IB); + mlx5_eswitch_register_vport_reps(esw, &rep_ops, REP_IB); } void mlx5_ib_unregister_vport_reps(struct mlx5_core_dev *mdev) diff --git a/drivers/infiniband/hw/mlx5/ib_rep.h b/drivers/infiniband/hw/mlx5/ib_rep.h index 8336e0517a5c..de43b423bafc 100644 --- a/drivers/infiniband/hw/mlx5/ib_rep.h +++ b/drivers/infiniband/hw/mlx5/ib_rep.h @@ -28,7 +28,7 @@ struct net_device *mlx5_ib_get_rep_netdev(struct mlx5_eswitch *esw, #else /* CONFIG_MLX5_ESWITCH */ static inline u8 mlx5_ib_eswitch_mode(struct mlx5_eswitch *esw) { - return SRIOV_NONE; + return MLX5_ESWITCH_NONE; } static inline @@ -72,6 +72,6 @@ struct net_device *mlx5_ib_get_rep_netdev(struct mlx5_eswitch *esw, static inline struct mlx5_ib_dev *mlx5_ib_rep_to_dev(struct mlx5_eswitch_rep *rep) { - return (struct mlx5_ib_dev *)rep->rep_if[REP_IB].priv; + return rep->rep_data[REP_IB].priv; } #endif /* __MLX5_IB_REP_H__ */ diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c index 6c529e6f3a01..348c1df69cdc 100644 --- a/drivers/infiniband/hw/mlx5/mad.c +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -200,19 +200,33 @@ static void pma_cnt_assign(struct ib_pma_portcounters *pma_cnt, vl_15_dropped); } -static int process_pma_cmd(struct mlx5_core_dev *mdev, u8 port_num, +static int process_pma_cmd(struct mlx5_ib_dev *dev, u8 port_num, const struct ib_mad *in_mad, struct ib_mad *out_mad) { - int err; + struct mlx5_core_dev *mdev; + bool native_port = true; + u8 mdev_port_num; void *out_cnt; + int err; + mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num); + if (!mdev) { + /* Fail to get the native port, likely due to 2nd port is still + * unaffiliated. In such case default to 1st port and attached + * PF device. + */ + native_port = false; + mdev = dev->mdev; + mdev_port_num = 1; + } /* Declaring support of extended counters */ if (in_mad->mad_hdr.attr_id == IB_PMA_CLASS_PORT_INFO) { struct ib_class_port_info cpi = {}; cpi.capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH; memcpy((out_mad->data + 40), &cpi, sizeof(cpi)); - return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; + err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; + goto done; } if (in_mad->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS_EXT) { @@ -221,11 +235,13 @@ static int process_pma_cmd(struct mlx5_core_dev *mdev, u8 port_num, int sz = MLX5_ST_SZ_BYTES(query_vport_counter_out); out_cnt = kvzalloc(sz, GFP_KERNEL); - if (!out_cnt) - return IB_MAD_RESULT_FAILURE; + if (!out_cnt) { + err = IB_MAD_RESULT_FAILURE; + goto done; + } err = mlx5_core_query_vport_counter(mdev, 0, 0, - port_num, out_cnt, sz); + mdev_port_num, out_cnt, sz); if (!err) pma_cnt_ext_assign(pma_cnt_ext, out_cnt); } else { @@ -234,20 +250,23 @@ static int process_pma_cmd(struct mlx5_core_dev *mdev, u8 port_num, int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); out_cnt = kvzalloc(sz, GFP_KERNEL); - if (!out_cnt) - return IB_MAD_RESULT_FAILURE; + if (!out_cnt) { + err = IB_MAD_RESULT_FAILURE; + goto done; + } - err = mlx5_core_query_ib_ppcnt(mdev, port_num, + err = mlx5_core_query_ib_ppcnt(mdev, mdev_port_num, out_cnt, sz); if (!err) pma_cnt_assign(pma_cnt, out_cnt); - } - + } kvfree(out_cnt); - if (err) - return IB_MAD_RESULT_FAILURE; - - return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; + err = err ? IB_MAD_RESULT_FAILURE : + IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +done: + if (native_port) + mlx5_ib_put_native_port_mdev(dev, port_num); + return err; } int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, @@ -259,8 +278,6 @@ int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, struct mlx5_ib_dev *dev = to_mdev(ibdev); const struct ib_mad *in_mad = (const struct ib_mad *)in; struct ib_mad *out_mad = (struct ib_mad *)out; - struct mlx5_core_dev *mdev; - u8 mdev_port_num; int ret; if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) || @@ -269,19 +286,14 @@ int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, memset(out_mad->data, 0, sizeof(out_mad->data)); - mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num); - if (!mdev) - return IB_MAD_RESULT_FAILURE; - - if (MLX5_CAP_GEN(mdev, vport_counters) && + if (MLX5_CAP_GEN(dev->mdev, vport_counters) && in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT && in_mad->mad_hdr.method == IB_MGMT_METHOD_GET) { - ret = process_pma_cmd(mdev, mdev_port_num, in_mad, out_mad); + ret = process_pma_cmd(dev, port_num, in_mad, out_mad); } else { ret = process_mad(ibdev, mad_flags, port_num, in_wc, in_grh, in_mad, out_mad); } - mlx5_ib_put_native_port_mdev(dev, port_num); return ret; } diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index abac70ad5c7c..c2a5780cb394 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -52,6 +52,7 @@ #include <linux/mlx5/port.h> #include <linux/mlx5/vport.h> #include <linux/mlx5/fs.h> +#include <linux/mlx5/eswitch.h> #include <linux/list.h> #include <rdma/ib_smi.h> #include <rdma/ib_umem.h> @@ -888,7 +889,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; if (MLX5_CAP_GEN(mdev, sho)) { - props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER; + props->device_cap_flags |= IB_DEVICE_INTEGRITY_HANDOVER; /* At this stage no support for signature handover */ props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 | IB_PROT_T10DIF_TYPE_2 | @@ -1008,6 +1009,8 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->max_srq_sge = max_rq_sg - 1; props->max_fast_reg_page_list_len = 1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size); + props->max_pi_fast_reg_page_list_len = + props->max_fast_reg_page_list_len / 2; get_atomic_caps_qp(dev, props); props->masked_atomic_cap = IB_ATOMIC_NONE; props->max_mcast_grp = 1 << MLX5_CAP_GEN(mdev, log_max_mcg); @@ -1043,15 +1046,19 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } if (MLX5_CAP_GEN(mdev, tag_matching)) { - props->tm_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE; props->tm_caps.max_num_tags = (1 << MLX5_CAP_GEN(mdev, log_tag_matching_list_sz)) - 1; - props->tm_caps.flags = IB_TM_CAP_RC; props->tm_caps.max_ops = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); props->tm_caps.max_sge = MLX5_TM_MAX_SGE; } + if (MLX5_CAP_GEN(mdev, tag_matching) && + MLX5_CAP_GEN(mdev, rndv_offload_rc)) { + props->tm_caps.flags = IB_TM_CAP_RNDV_RC; + props->tm_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE; + } + if (MLX5_CAP_GEN(dev->mdev, cq_moderation)) { props->cq_caps.max_cq_moderation_count = MLX5_MAX_CQ_COUNT; @@ -2344,7 +2351,7 @@ static int handle_alloc_dm_sw_icm(struct ib_ucontext *ctx, /* Allocation size must a multiple of the basic block size * and a power of 2. */ - act_size = roundup(attr->length, MLX5_SW_ICM_BLOCK_SIZE(dm_db->dev)); + act_size = round_up(attr->length, MLX5_SW_ICM_BLOCK_SIZE(dm_db->dev)); act_size = roundup_pow_of_two(act_size); dm->size = act_size; @@ -2666,11 +2673,15 @@ int parse_flow_flow_action(struct mlx5_ib_flow_action *maction, } } -static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c, - u32 *match_v, const union ib_flow_spec *ib_spec, +static int parse_flow_attr(struct mlx5_core_dev *mdev, + struct mlx5_flow_spec *spec, + const union ib_flow_spec *ib_spec, const struct ib_flow_attr *flow_attr, struct mlx5_flow_act *action, u32 prev_type) { + struct mlx5_flow_context *flow_context = &spec->flow_context; + u32 *match_c = spec->match_criteria; + u32 *match_v = spec->match_value; void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c, misc_parameters); void *misc_params_v = MLX5_ADDR_OF(fte_match_param, match_v, @@ -2989,8 +3000,8 @@ static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c, if (ib_spec->flow_tag.tag_id >= BIT(24)) return -EINVAL; - action->flow_tag = ib_spec->flow_tag.tag_id; - action->flags |= FLOW_ACT_HAS_TAG; + flow_context->flow_tag = ib_spec->flow_tag.tag_id; + flow_context->flags |= FLOW_CONTEXT_HAS_TAG; break; case IB_FLOW_SPEC_ACTION_DROP: if (FIELDS_NOT_SUPPORTED(ib_spec->drop, @@ -3084,7 +3095,8 @@ is_valid_esp_aes_gcm(struct mlx5_core_dev *mdev, return VALID_SPEC_NA; return is_crypto && is_ipsec && - (!egress || (!is_drop && !(flow_act->flags & FLOW_ACT_HAS_TAG))) ? + (!egress || (!is_drop && + !(spec->flow_context.flags & FLOW_CONTEXT_HAS_TAG))) ? VALID_SPEC_VALID : VALID_SPEC_INVALID; } @@ -3252,11 +3264,14 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, int max_table_size; int num_entries; int num_groups; + bool esw_encap; u32 flags = 0; int priority; max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, log_max_ft_size)); + esw_encap = mlx5_eswitch_get_encap_mode(dev->mdev) != + DEVLINK_ESWITCH_ENCAP_MODE_NONE; if (flow_attr->type == IB_FLOW_ATTR_NORMAL) { enum mlx5_flow_namespace_type fn_type; @@ -3269,10 +3284,10 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, if (ft_type == MLX5_IB_FT_RX) { fn_type = MLX5_FLOW_NAMESPACE_BYPASS; prio = &dev->flow_db->prios[priority]; - if (!dev->is_rep && + if (!dev->is_rep && !esw_encap && MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap)) flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP; - if (!dev->is_rep && + if (!dev->is_rep && !esw_encap && MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, reformat_l3_tunnel_to_l2)) flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; @@ -3282,7 +3297,7 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, log_max_ft_size)); fn_type = MLX5_FLOW_NAMESPACE_EGRESS; prio = &dev->flow_db->egress_prios[priority]; - if (!dev->is_rep && + if (!dev->is_rep && !esw_encap && MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat)) flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; } @@ -3464,6 +3479,37 @@ free: return ret; } +static void mlx5_ib_set_rule_source_port(struct mlx5_ib_dev *dev, + struct mlx5_flow_spec *spec, + struct mlx5_eswitch_rep *rep) +{ + struct mlx5_eswitch *esw = dev->mdev->priv.eswitch; + void *misc; + + if (mlx5_eswitch_vport_match_metadata_enabled(esw)) { + misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters_2); + + MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0, + mlx5_eswitch_get_vport_metadata_for_match(esw, + rep->vport)); + misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + misc_parameters_2); + + MLX5_SET_TO_ONES(fte_match_set_misc2, misc, metadata_reg_c_0); + } else { + misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters); + + MLX5_SET(fte_match_set_misc, misc, source_port, rep->vport); + + misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + misc_parameters); + + MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port); + } +} + static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, struct mlx5_ib_flow_prio *ft_prio, const struct ib_flow_attr *flow_attr, @@ -3473,7 +3519,7 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, { struct mlx5_flow_table *ft = ft_prio->flow_table; struct mlx5_ib_flow_handler *handler; - struct mlx5_flow_act flow_act = {.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG}; + struct mlx5_flow_act flow_act = {}; struct mlx5_flow_spec *spec; struct mlx5_flow_destination dest_arr[2] = {}; struct mlx5_flow_destination *rule_dst = dest_arr; @@ -3504,8 +3550,7 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, } for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) { - err = parse_flow_attr(dev->mdev, spec->match_criteria, - spec->match_value, + err = parse_flow_attr(dev->mdev, spec, ib_flow, flow_attr, &flow_act, prev_type); if (err < 0) @@ -3519,19 +3564,15 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, set_underlay_qp(dev, spec, underlay_qpn); if (dev->is_rep) { - void *misc; + struct mlx5_eswitch_rep *rep; - if (!dev->port[flow_attr->port - 1].rep) { + rep = dev->port[flow_attr->port - 1].rep; + if (!rep) { err = -EINVAL; goto free; } - misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, - misc_parameters); - MLX5_SET(fte_match_set_misc, misc, source_port, - dev->port[flow_attr->port - 1].rep->vport); - misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, - misc_parameters); - MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port); + + mlx5_ib_set_rule_source_port(dev, spec, rep); } spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria); @@ -3572,11 +3613,11 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO; } - if ((flow_act.flags & FLOW_ACT_HAS_TAG) && + if ((spec->flow_context.flags & FLOW_CONTEXT_HAS_TAG) && (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) { mlx5_ib_warn(dev, "Flow tag %u and attribute type %x isn't allowed in leftovers\n", - flow_act.flow_tag, flow_attr->type); + spec->flow_context.flow_tag, flow_attr->type); err = -EINVAL; goto free; } @@ -3892,6 +3933,7 @@ _get_flow_table(struct mlx5_ib_dev *dev, struct mlx5_flow_namespace *ns = NULL; struct mlx5_ib_flow_prio *prio = NULL; int max_table_size = 0; + bool esw_encap; u32 flags = 0; int priority; @@ -3900,22 +3942,30 @@ _get_flow_table(struct mlx5_ib_dev *dev, else priority = ib_prio_to_core_prio(fs_matcher->priority, false); + esw_encap = mlx5_eswitch_get_encap_mode(dev->mdev) != + DEVLINK_ESWITCH_ENCAP_MODE_NONE; if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS) { max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, log_max_ft_size)); - if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap)) + if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap) && !esw_encap) flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP; if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, - reformat_l3_tunnel_to_l2)) + reformat_l3_tunnel_to_l2) && + !esw_encap) flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; } else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS) { max_table_size = BIT( MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, log_max_ft_size)); - if (MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat)) + if (MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat) && !esw_encap) flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; } else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB) { max_table_size = BIT( MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, log_max_ft_size)); + if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, decap) && esw_encap) + flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP; + if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, reformat_l3_tunnel_to_l2) && + esw_encap) + flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; priority = FDB_BYPASS_PATH; } @@ -3947,6 +3997,7 @@ _create_raw_flow_rule(struct mlx5_ib_dev *dev, struct mlx5_ib_flow_prio *ft_prio, struct mlx5_flow_destination *dst, struct mlx5_ib_flow_matcher *fs_matcher, + struct mlx5_flow_context *flow_context, struct mlx5_flow_act *flow_act, void *cmd_in, int inlen, int dst_num) @@ -3969,6 +4020,7 @@ _create_raw_flow_rule(struct mlx5_ib_dev *dev, memcpy(spec->match_criteria, fs_matcher->matcher_mask.match_params, fs_matcher->mask_len); spec->match_criteria_enable = fs_matcher->match_criteria_enable; + spec->flow_context = *flow_context; handler->rule = mlx5_add_flow_rules(ft, spec, flow_act, dst, dst_num); @@ -4033,6 +4085,7 @@ static bool raw_fs_is_multicast(struct mlx5_ib_flow_matcher *fs_matcher, struct mlx5_ib_flow_handler * mlx5_ib_raw_fs_rule_add(struct mlx5_ib_dev *dev, struct mlx5_ib_flow_matcher *fs_matcher, + struct mlx5_flow_context *flow_context, struct mlx5_flow_act *flow_act, u32 counter_id, void *cmd_in, int inlen, int dest_id, @@ -4085,7 +4138,8 @@ mlx5_ib_raw_fs_rule_add(struct mlx5_ib_dev *dev, dst_num++; } - handler = _create_raw_flow_rule(dev, ft_prio, dst, fs_matcher, flow_act, + handler = _create_raw_flow_rule(dev, ft_prio, dst, fs_matcher, + flow_context, flow_act, cmd_in, inlen, dst_num); if (IS_ERR(handler)) { @@ -4457,7 +4511,7 @@ static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev) * lock/unlock above locks Now need to arm all involved CQs. */ list_for_each_entry(mcq, &cq_armed_list, reset_notify) { - mcq->comp(mcq); + mcq->comp(mcq, NULL); } spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags); } @@ -4676,7 +4730,7 @@ static int __get_port_caps(struct mlx5_ib_dev *dev, u8 port) int err = -ENOMEM; struct ib_udata uhw = {.inlen = 0, .outlen = 0}; - pprops = kmalloc(sizeof(*pprops), GFP_KERNEL); + pprops = kzalloc(sizeof(*pprops), GFP_KERNEL); if (!pprops) goto out; @@ -4690,7 +4744,6 @@ static int __get_port_caps(struct mlx5_ib_dev *dev, u8 port) goto out; } - memset(pprops, 0, sizeof(*pprops)); err = mlx5_ib_query_port(&dev->ib_dev, port, pprops); if (err) { mlx5_ib_warn(dev, "query_port %d failed %d\n", @@ -4891,18 +4944,19 @@ static int create_dev_resources(struct mlx5_ib_resources *devr) if (ret) goto error0; - devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, &cq_attr, NULL); - if (IS_ERR(devr->c0)) { - ret = PTR_ERR(devr->c0); + devr->c0 = rdma_zalloc_drv_obj(ibdev, ib_cq); + if (!devr->c0) { + ret = -ENOMEM; goto error1; } - devr->c0->device = &dev->ib_dev; - devr->c0->uobject = NULL; - devr->c0->comp_handler = NULL; - devr->c0->event_handler = NULL; - devr->c0->cq_context = NULL; + + devr->c0->device = &dev->ib_dev; atomic_set(&devr->c0->usecnt, 0); + ret = mlx5_ib_create_cq(devr->c0, &cq_attr, NULL); + if (ret) + goto err_create_cq; + devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL); if (IS_ERR(devr->x0)) { ret = PTR_ERR(devr->x0); @@ -4994,6 +5048,8 @@ error3: mlx5_ib_dealloc_xrcd(devr->x0, NULL); error2: mlx5_ib_destroy_cq(devr->c0, NULL); +err_create_cq: + kfree(devr->c0); error1: mlx5_ib_dealloc_pd(devr->p0, NULL); error0: @@ -5012,6 +5068,7 @@ static void destroy_dev_resources(struct mlx5_ib_resources *devr) mlx5_ib_dealloc_xrcd(devr->x0, NULL); mlx5_ib_dealloc_xrcd(devr->x1, NULL); mlx5_ib_destroy_cq(devr->c0, NULL); + kfree(devr->c0); mlx5_ib_dealloc_pd(devr->p0, NULL); kfree(devr->p0); @@ -5424,7 +5481,8 @@ static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev, static int mlx5_ib_query_q_counters(struct mlx5_core_dev *mdev, struct mlx5_ib_port *port, - struct rdma_hw_stats *stats) + struct rdma_hw_stats *stats, + u16 set_id) { int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out); void *out; @@ -5435,9 +5493,7 @@ static int mlx5_ib_query_q_counters(struct mlx5_core_dev *mdev, if (!out) return -ENOMEM; - ret = mlx5_core_query_q_counter(mdev, - port->cnts.set_id, 0, - out, outlen); + ret = mlx5_core_query_q_counter(mdev, set_id, 0, out, outlen); if (ret) goto free; @@ -5497,7 +5553,8 @@ static int mlx5_ib_get_hw_stats(struct ib_device *ibdev, port->cnts.num_ext_ppcnt_counters; /* q_counters are per IB device, query the master mdev */ - ret = mlx5_ib_query_q_counters(dev->mdev, port, stats); + ret = mlx5_ib_query_q_counters(dev->mdev, port, stats, + port->cnts.set_id); if (ret) return ret; @@ -5533,6 +5590,68 @@ done: return num_counters; } +static struct rdma_hw_stats * +mlx5_ib_counter_alloc_stats(struct rdma_counter *counter) +{ + struct mlx5_ib_dev *dev = to_mdev(counter->device); + struct mlx5_ib_port *port = &dev->port[counter->port - 1]; + + /* Q counters are in the beginning of all counters */ + return rdma_alloc_hw_stats_struct(port->cnts.names, + port->cnts.num_q_counters, + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} + +static int mlx5_ib_counter_update_stats(struct rdma_counter *counter) +{ + struct mlx5_ib_dev *dev = to_mdev(counter->device); + struct mlx5_ib_port *port = &dev->port[counter->port - 1]; + + return mlx5_ib_query_q_counters(dev->mdev, port, + counter->stats, counter->id); +} + +static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter, + struct ib_qp *qp) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + u16 cnt_set_id = 0; + int err; + + if (!counter->id) { + err = mlx5_cmd_alloc_q_counter(dev->mdev, + &cnt_set_id, + MLX5_SHARED_RESOURCE_UID); + if (err) + return err; + counter->id = cnt_set_id; + } + + err = mlx5_ib_qp_set_counter(qp, counter); + if (err) + goto fail_set_counter; + + return 0; + +fail_set_counter: + mlx5_core_dealloc_q_counter(dev->mdev, cnt_set_id); + counter->id = 0; + + return err; +} + +static int mlx5_ib_counter_unbind_qp(struct ib_qp *qp) +{ + return mlx5_ib_qp_set_counter(qp, NULL); +} + +static int mlx5_ib_counter_dealloc(struct rdma_counter *counter) +{ + struct mlx5_ib_dev *dev = to_mdev(counter->device); + + return mlx5_core_dealloc_q_counter(dev->mdev, counter->id); +} + static int mlx5_ib_rn_get_params(struct ib_device *device, u8 port_num, enum rdma_netdev_t type, struct rdma_netdev_alloc_params *params) @@ -6044,7 +6163,6 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) if (mlx5_use_mad_ifc(dev)) get_ext_port_caps(dev); - dev->ib_dev.owner = THIS_MODULE; dev->ib_dev.node_type = RDMA_NODE_IB_CA; dev->ib_dev.local_dma_lkey = 0 /* not supported for now */; dev->ib_dev.phys_port_cnt = dev->num_ports; @@ -6124,8 +6242,13 @@ static void mlx5_ib_stage_flow_db_cleanup(struct mlx5_ib_dev *dev) } static const struct ib_device_ops mlx5_ib_dev_ops = { + .owner = THIS_MODULE, + .driver_id = RDMA_DRIVER_MLX5, + .uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION, + .add_gid = mlx5_ib_add_gid, .alloc_mr = mlx5_ib_alloc_mr, + .alloc_mr_integrity = mlx5_ib_alloc_mr_integrity, .alloc_pd = mlx5_ib_alloc_pd, .alloc_ucontext = mlx5_ib_alloc_ucontext, .attach_mcast = mlx5_ib_mcg_attach, @@ -6155,6 +6278,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = { .get_dma_mr = mlx5_ib_get_dma_mr, .get_link_layer = mlx5_ib_port_link_layer, .map_mr_sg = mlx5_ib_map_mr_sg, + .map_mr_sg_pi = mlx5_ib_map_mr_sg_pi, .mmap = mlx5_ib_mmap, .modify_cq = mlx5_ib_modify_cq, .modify_device = mlx5_ib_modify_device, @@ -6179,6 +6303,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = { .resize_cq = mlx5_ib_resize_cq, INIT_RDMA_OBJ_SIZE(ib_ah, mlx5_ib_ah, ibah), + INIT_RDMA_OBJ_SIZE(ib_cq, mlx5_ib_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_pd, mlx5_ib_pd, ibpd), INIT_RDMA_OBJ_SIZE(ib_srq, mlx5_ib_srq, ibsrq), INIT_RDMA_OBJ_SIZE(ib_ucontext, mlx5_ib_ucontext, ibucontext), @@ -6221,7 +6346,6 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) struct mlx5_core_dev *mdev = dev->mdev; int err; - dev->ib_dev.uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION; dev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | @@ -6290,7 +6414,6 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_DEVICE) ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_flow_ipsec_ops); - dev->ib_dev.driver_id = RDMA_DRIVER_MLX5; ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_ops); if (IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)) @@ -6305,6 +6428,8 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc))) mutex_init(&dev->lb.mutex); + dev->ib_dev.use_cq_dim = true; + return 0; } @@ -6452,6 +6577,11 @@ static void mlx5_ib_stage_odp_cleanup(struct mlx5_ib_dev *dev) static const struct ib_device_ops mlx5_ib_dev_hw_stats_ops = { .alloc_hw_stats = mlx5_ib_alloc_hw_stats, .get_hw_stats = mlx5_ib_get_hw_stats, + .counter_bind_qp = mlx5_ib_counter_bind_qp, + .counter_unbind_qp = mlx5_ib_counter_unbind_qp, + .counter_dealloc = mlx5_ib_counter_dealloc, + .counter_alloc_stats = mlx5_ib_counter_alloc_stats, + .counter_update_stats = mlx5_ib_counter_update_stats, }; static int mlx5_ib_stage_counters_init(struct mlx5_ib_dev *dev) @@ -6572,15 +6702,19 @@ static int mlx5_ib_stage_devx_init(struct mlx5_ib_dev *dev) int uid; uid = mlx5_ib_devx_create(dev, false); - if (uid > 0) + if (uid > 0) { dev->devx_whitelist_uid = uid; + mlx5_ib_devx_init_event_table(dev); + } return 0; } static void mlx5_ib_stage_devx_cleanup(struct mlx5_ib_dev *dev) { - if (dev->devx_whitelist_uid) + if (dev->devx_whitelist_uid) { + mlx5_ib_devx_cleanup_event_table(dev); mlx5_ib_devx_destroy(dev, dev->devx_whitelist_uid); + } } void __mlx5_ib_remove(struct mlx5_ib_dev *dev, @@ -6779,7 +6913,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) printk_once(KERN_INFO "%s", mlx5_version); if (MLX5_ESWITCH_MANAGER(mdev) && - mlx5_ib_eswitch_mode(mdev->priv.eswitch) == SRIOV_OFFLOADS) { + mlx5_ib_eswitch_mode(mdev->priv.eswitch) == MLX5_ESWITCH_OFFLOADS) { if (!mlx5_core_mp_enabled(mdev)) mlx5_ib_register_vport_reps(mdev); return mdev; diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c index 9f90be296ee0..fe1a76d8531c 100644 --- a/drivers/infiniband/hw/mlx5/mem.c +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -55,9 +55,10 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int i = 0; struct scatterlist *sg; int entry; - unsigned long page_shift = umem->page_shift; if (umem->is_odp) { + unsigned int page_shift = to_ib_umem_odp(umem)->page_shift; + *ncont = ib_umem_page_count(umem); *count = *ncont << (page_shift - PAGE_SHIFT); *shift = page_shift; @@ -67,15 +68,15 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, return; } - addr = addr >> page_shift; + addr = addr >> PAGE_SHIFT; tmp = (unsigned long)addr; m = find_first_bit(&tmp, BITS_PER_LONG); if (max_page_shift) - m = min_t(unsigned long, max_page_shift - page_shift, m); + m = min_t(unsigned long, max_page_shift - PAGE_SHIFT, m); for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { - len = sg_dma_len(sg) >> page_shift; - pfn = sg_dma_address(sg) >> page_shift; + len = sg_dma_len(sg) >> PAGE_SHIFT; + pfn = sg_dma_address(sg) >> PAGE_SHIFT; if (base + p != pfn) { /* If either the offset or the new * base are unaligned update m @@ -107,7 +108,7 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, *ncont = 0; } - *shift = page_shift + m; + *shift = PAGE_SHIFT + m; *count = i; } @@ -140,8 +141,7 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, int page_shift, size_t offset, size_t num_pages, __be64 *pas, int access_flags) { - unsigned long umem_page_shift = umem->page_shift; - int shift = page_shift - umem_page_shift; + int shift = page_shift - PAGE_SHIFT; int mask = (1 << shift) - 1; int i, k, idx; u64 cur = 0; @@ -165,7 +165,7 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, i = 0; for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { - len = sg_dma_len(sg) >> umem_page_shift; + len = sg_dma_len(sg) >> PAGE_SHIFT; base = sg_dma_address(sg); /* Skip elements below offset */ @@ -184,7 +184,7 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, for (; k < len; k++) { if (!(i & mask)) { - cur = base + (k << umem_page_shift); + cur = base + (k << PAGE_SHIFT); cur |= access_flags; idx = (i >> shift) - offset; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 40eb8be482e4..c482f19958b3 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -431,9 +431,6 @@ struct mlx5_ib_qp { int create_type; - /* Store signature errors */ - bool signature_en; - struct list_head qps_list; struct list_head cq_recv_list; struct list_head cq_send_list; @@ -442,6 +439,10 @@ struct mlx5_ib_qp { u32 flags_en; /* storage for qp sub type when core qp type is IB_QPT_DRIVER */ enum ib_qp_type qp_sub_type; + /* A flag to indicate if there's a new counter is configured + * but not take effective + */ + u32 counter_pending; }; struct mlx5_ib_cq_buf { @@ -587,6 +588,9 @@ struct mlx5_ib_mr { void *descs; dma_addr_t desc_map; int ndescs; + int data_length; + int meta_ndescs; + int meta_length; int max_descs; int desc_size; int access_mode; @@ -605,6 +609,13 @@ struct mlx5_ib_mr { int access_flags; /* Needed for rereg MR */ struct mlx5_ib_mr *parent; + /* Needed for IB_MR_TYPE_INTEGRITY */ + struct mlx5_ib_mr *pi_mr; + struct mlx5_ib_mr *klm_mr; + struct mlx5_ib_mr *mtt_mr; + u64 data_iova; + u64 pi_iova; + atomic_t num_leaf_free; wait_queue_head_t q_leaf_free; struct mlx5_async_work cb_work; @@ -920,6 +931,7 @@ struct mlx5_ib_lb_state { }; struct mlx5_ib_pf_eq { + struct notifier_block irq_nb; struct mlx5_ib_dev *dev; struct mlx5_eq *core; struct work_struct work; @@ -928,6 +940,13 @@ struct mlx5_ib_pf_eq { mempool_t *pool; }; +struct mlx5_devx_event_table { + struct mlx5_nb devx_nb; + /* serialize updating the event_xa */ + struct mutex event_xa_lock; + struct xarray event_xa; +}; + struct mlx5_ib_dev { struct ib_device ib_dev; struct mlx5_core_dev *mdev; @@ -977,7 +996,7 @@ struct mlx5_ib_dev { u16 devx_whitelist_uid; struct mlx5_srq_table srq_table; struct mlx5_async_ctx async_ctx; - int free_port; + struct mlx5_devx_event_table devx_event_table; }; static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) @@ -1115,10 +1134,9 @@ int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer, int buflen, size_t *bc); int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, void *buffer, int buflen, size_t *bc); -struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata); -int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); +int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); +void mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); @@ -1148,8 +1166,15 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata); struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg, struct ib_udata *udata); +struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd, + u32 max_num_sg, + u32 max_num_meta_sg); int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset); +int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg, + int data_sg_nents, unsigned int *data_sg_offset, + struct scatterlist *meta_sg, int meta_sg_nents, + unsigned int *meta_sg_offset); int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, const struct ib_wc *in_wc, const struct ib_grh *in_grh, const struct ib_mad_hdr *in, size_t in_mad_size, @@ -1201,7 +1226,7 @@ int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd, struct ib_wq_init_attr *init_attr, struct ib_udata *udata); -int mlx5_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata); +void mlx5_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata); int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, u32 wq_attr_mask, struct ib_udata *udata); struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device, @@ -1311,11 +1336,14 @@ void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev *dev, #if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user); void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid); +void mlx5_ib_devx_init_event_table(struct mlx5_ib_dev *dev); +void mlx5_ib_devx_cleanup_event_table(struct mlx5_ib_dev *dev); const struct uverbs_object_tree_def *mlx5_ib_get_devx_tree(void); extern const struct uapi_definition mlx5_ib_devx_defs[]; extern const struct uapi_definition mlx5_ib_flow_defs[]; struct mlx5_ib_flow_handler *mlx5_ib_raw_fs_rule_add( struct mlx5_ib_dev *dev, struct mlx5_ib_flow_matcher *fs_matcher, + struct mlx5_flow_context *flow_context, struct mlx5_flow_act *flow_act, u32 counter_id, void *cmd_in, int inlen, int dest_id, int dest_type); bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id, int *dest_type); @@ -1327,6 +1355,8 @@ static inline int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user) { return -EOPNOTSUPP; } static inline void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid) {} +static inline void mlx5_ib_devx_init_event_table(struct mlx5_ib_dev *dev) {} +static inline void mlx5_ib_devx_cleanup_event_table(struct mlx5_ib_dev *dev) {} static inline bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id, int *dest_type) { @@ -1442,4 +1472,6 @@ void mlx5_ib_put_xlt_emergency_page(void); int bfregn_to_uar_index(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi, u32 bfregn, bool dyn_bfreg); + +int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter); #endif /* MLX5_IB_H */ diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 5f09699fab98..20ece6e0b2fc 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -130,7 +130,7 @@ static void reg_mr_callback(int status, struct mlx5_async_work *context) struct mlx5_cache_ent *ent = &cache->ent[c]; u8 key; unsigned long flags; - struct mlx5_mkey_table *table = &dev->mdev->priv.mkey_table; + struct xarray *mkeys = &dev->mdev->priv.mkey_table; int err; spin_lock_irqsave(&ent->lock, flags); @@ -158,12 +158,12 @@ static void reg_mr_callback(int status, struct mlx5_async_work *context) ent->size++; spin_unlock_irqrestore(&ent->lock, flags); - write_lock_irqsave(&table->lock, flags); - err = radix_tree_insert(&table->tree, mlx5_base_mkey(mr->mmkey.key), - &mr->mmkey); + xa_lock_irqsave(mkeys, flags); + err = xa_err(__xa_store(mkeys, mlx5_base_mkey(mr->mmkey.key), + &mr->mmkey, GFP_ATOMIC)); + xa_unlock_irqrestore(mkeys, flags); if (err) pr_err("Error inserting to mkey tree. 0x%x\n", -err); - write_unlock_irqrestore(&table->lock, flags); if (!completion_done(&ent->compl)) complete(&ent->compl); @@ -1507,10 +1507,9 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, return 0; err: - if (mr->umem) { - ib_umem_release(mr->umem); - mr->umem = NULL; - } + ib_umem_release(mr->umem); + mr->umem = NULL; + clean_mr(dev, mr); return err; } @@ -1606,8 +1605,9 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) synchronize_srcu(&dev->mr_srcu); /* Destroy all page mappings */ if (umem_odp->page_list) - mlx5_ib_invalidate_range(umem_odp, ib_umem_start(umem), - ib_umem_end(umem)); + mlx5_ib_invalidate_range(umem_odp, + ib_umem_start(umem_odp), + ib_umem_end(umem_odp)); else mlx5_ib_free_implicit_mr(mr); /* @@ -1629,28 +1629,85 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) * remove the DMA mapping. */ mlx5_mr_cache_free(dev, mr); - if (umem) { - ib_umem_release(umem); + ib_umem_release(umem); + if (umem) atomic_sub(npages, &dev->mdev->priv.reg_pages); - } + if (!mr->allocated_from_cache) kfree(mr); } int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) { - dereg_mr(to_mdev(ibmr->device), to_mmr(ibmr)); + struct mlx5_ib_mr *mmr = to_mmr(ibmr); + + if (ibmr->type == IB_MR_TYPE_INTEGRITY) { + dereg_mr(to_mdev(mmr->mtt_mr->ibmr.device), mmr->mtt_mr); + dereg_mr(to_mdev(mmr->klm_mr->ibmr.device), mmr->klm_mr); + } + + dereg_mr(to_mdev(ibmr->device), mmr); + return 0; } -struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, - u32 max_num_sg, struct ib_udata *udata) +static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs, + int access_mode, int page_shift) +{ + void *mkc; + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + + MLX5_SET(mkc, mkc, free, 1); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); + MLX5_SET(mkc, mkc, translations_octword_size, ndescs); + MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3); + MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7); + MLX5_SET(mkc, mkc, umr_en, 1); + MLX5_SET(mkc, mkc, log_page_size, page_shift); +} + +static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, + int ndescs, int desc_size, int page_shift, + int access_mode, u32 *in, int inlen) { struct mlx5_ib_dev *dev = to_mdev(pd->device); + int err; + + mr->access_mode = access_mode; + mr->desc_size = desc_size; + mr->max_descs = ndescs; + + err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size); + if (err) + return err; + + mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift); + + err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen); + if (err) + goto err_free_descs; + + mr->mmkey.type = MLX5_MKEY_MR; + mr->ibmr.lkey = mr->mmkey.key; + mr->ibmr.rkey = mr->mmkey.key; + + return 0; + +err_free_descs: + mlx5_free_priv_descs(mr); + return err; +} + +static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd, + u32 max_num_sg, u32 max_num_meta_sg, + int desc_size, int access_mode) +{ int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); - int ndescs = ALIGN(max_num_sg, 4); + int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4); + int page_shift = 0; struct mlx5_ib_mr *mr; - void *mkc; u32 *in; int err; @@ -1658,99 +1715,168 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, if (!mr) return ERR_PTR(-ENOMEM); + mr->ibmr.pd = pd; + mr->ibmr.device = pd->device; + in = kzalloc(inlen, GFP_KERNEL); if (!in) { err = -ENOMEM; goto err_free; } + if (access_mode == MLX5_MKC_ACCESS_MODE_MTT) + page_shift = PAGE_SHIFT; + + err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift, + access_mode, in, inlen); + if (err) + goto err_free_in; + + mr->umem = NULL; + kfree(in); + + return mr; + +err_free_in: + kfree(in); +err_free: + kfree(mr); + return ERR_PTR(err); +} + +static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, + int ndescs, u32 *in, int inlen) +{ + return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt), + PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in, + inlen); +} + +static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, + int ndescs, u32 *in, int inlen) +{ + return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm), + 0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen); +} + +static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, + int max_num_sg, int max_num_meta_sg, + u32 *in, int inlen) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + u32 psv_index[2]; + void *mkc; + int err; + + mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL); + if (!mr->sig) + return -ENOMEM; + + /* create mem & wire PSVs */ + err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index); + if (err) + goto err_free_sig; + + mr->sig->psv_memory.psv_idx = psv_index[0]; + mr->sig->psv_wire.psv_idx = psv_index[1]; + + mr->sig->sig_status_checked = true; + mr->sig->sig_err_exists = false; + /* Next UMR, Arm SIGERR */ + ++mr->sig->sigerr_count; + mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg, + sizeof(struct mlx5_klm), + MLX5_MKC_ACCESS_MODE_KLMS); + if (IS_ERR(mr->klm_mr)) { + err = PTR_ERR(mr->klm_mr); + goto err_destroy_psv; + } + mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg, + sizeof(struct mlx5_mtt), + MLX5_MKC_ACCESS_MODE_MTT); + if (IS_ERR(mr->mtt_mr)) { + err = PTR_ERR(mr->mtt_mr); + goto err_free_klm_mr; + } + + /* Set bsf descriptors for mkey */ mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); - MLX5_SET(mkc, mkc, free, 1); - MLX5_SET(mkc, mkc, translations_octword_size, ndescs); - MLX5_SET(mkc, mkc, qpn, 0xffffff); - MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); + MLX5_SET(mkc, mkc, bsf_en, 1); + MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE); - if (mr_type == IB_MR_TYPE_MEM_REG) { - mr->access_mode = MLX5_MKC_ACCESS_MODE_MTT; - MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); - err = mlx5_alloc_priv_descs(pd->device, mr, - ndescs, sizeof(struct mlx5_mtt)); - if (err) - goto err_free_in; + err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0, + MLX5_MKC_ACCESS_MODE_KLMS, in, inlen); + if (err) + goto err_free_mtt_mr; - mr->desc_size = sizeof(struct mlx5_mtt); - mr->max_descs = ndescs; - } else if (mr_type == IB_MR_TYPE_SG_GAPS) { - mr->access_mode = MLX5_MKC_ACCESS_MODE_KLMS; + return 0; - err = mlx5_alloc_priv_descs(pd->device, mr, - ndescs, sizeof(struct mlx5_klm)); - if (err) - goto err_free_in; - mr->desc_size = sizeof(struct mlx5_klm); - mr->max_descs = ndescs; - } else if (mr_type == IB_MR_TYPE_SIGNATURE) { - u32 psv_index[2]; - - MLX5_SET(mkc, mkc, bsf_en, 1); - MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE); - mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL); - if (!mr->sig) { - err = -ENOMEM; - goto err_free_in; - } +err_free_mtt_mr: + dereg_mr(to_mdev(mr->mtt_mr->ibmr.device), mr->mtt_mr); + mr->mtt_mr = NULL; +err_free_klm_mr: + dereg_mr(to_mdev(mr->klm_mr->ibmr.device), mr->klm_mr); + mr->klm_mr = NULL; +err_destroy_psv: + if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", + mr->sig->psv_memory.psv_idx); + if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", + mr->sig->psv_wire.psv_idx); +err_free_sig: + kfree(mr->sig); - /* create mem & wire PSVs */ - err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, - 2, psv_index); - if (err) - goto err_free_sig; + return err; +} + +static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd, + enum ib_mr_type mr_type, u32 max_num_sg, + u32 max_num_meta_sg) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + int ndescs = ALIGN(max_num_sg, 4); + struct mlx5_ib_mr *mr; + u32 *in; + int err; - mr->access_mode = MLX5_MKC_ACCESS_MODE_KLMS; - mr->sig->psv_memory.psv_idx = psv_index[0]; - mr->sig->psv_wire.psv_idx = psv_index[1]; + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); - mr->sig->sig_status_checked = true; - mr->sig->sig_err_exists = false; - /* Next UMR, Arm SIGERR */ - ++mr->sig->sigerr_count; - } else { + in = kzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_free; + } + + mr->ibmr.device = pd->device; + mr->umem = NULL; + + switch (mr_type) { + case IB_MR_TYPE_MEM_REG: + err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen); + break; + case IB_MR_TYPE_SG_GAPS: + err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen); + break; + case IB_MR_TYPE_INTEGRITY: + err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg, + max_num_meta_sg, in, inlen); + break; + default: mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type); err = -EINVAL; - goto err_free_in; } - MLX5_SET(mkc, mkc, access_mode_1_0, mr->access_mode & 0x3); - MLX5_SET(mkc, mkc, access_mode_4_2, (mr->access_mode >> 2) & 0x7); - MLX5_SET(mkc, mkc, umr_en, 1); - - mr->ibmr.device = pd->device; - err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen); if (err) - goto err_destroy_psv; + goto err_free_in; - mr->mmkey.type = MLX5_MKEY_MR; - mr->ibmr.lkey = mr->mmkey.key; - mr->ibmr.rkey = mr->mmkey.key; - mr->umem = NULL; kfree(in); return &mr->ibmr; -err_destroy_psv: - if (mr->sig) { - if (mlx5_core_destroy_psv(dev->mdev, - mr->sig->psv_memory.psv_idx)) - mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", - mr->sig->psv_memory.psv_idx); - if (mlx5_core_destroy_psv(dev->mdev, - mr->sig->psv_wire.psv_idx)) - mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", - mr->sig->psv_wire.psv_idx); - } - mlx5_free_priv_descs(mr); -err_free_sig: - kfree(mr->sig); err_free_in: kfree(in); err_free: @@ -1758,6 +1884,19 @@ err_free: return ERR_PTR(err); } +struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, + u32 max_num_sg, struct ib_udata *udata) +{ + return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0); +} + +struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd, + u32 max_num_sg, u32 max_num_meta_sg) +{ + return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg, + max_num_meta_sg); +} + struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, struct ib_udata *udata) { @@ -1887,16 +2026,53 @@ done: } static int +mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg, + int data_sg_nents, unsigned int *data_sg_offset, + struct scatterlist *meta_sg, int meta_sg_nents, + unsigned int *meta_sg_offset) +{ + struct mlx5_ib_mr *mr = to_mmr(ibmr); + unsigned int sg_offset = 0; + int n = 0; + + mr->meta_length = 0; + if (data_sg_nents == 1) { + n++; + mr->ndescs = 1; + if (data_sg_offset) + sg_offset = *data_sg_offset; + mr->data_length = sg_dma_len(data_sg) - sg_offset; + mr->data_iova = sg_dma_address(data_sg) + sg_offset; + if (meta_sg_nents == 1) { + n++; + mr->meta_ndescs = 1; + if (meta_sg_offset) + sg_offset = *meta_sg_offset; + else + sg_offset = 0; + mr->meta_length = sg_dma_len(meta_sg) - sg_offset; + mr->pi_iova = sg_dma_address(meta_sg) + sg_offset; + } + ibmr->length = mr->data_length + mr->meta_length; + } + + return n; +} + +static int mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr, struct scatterlist *sgl, unsigned short sg_nents, - unsigned int *sg_offset_p) + unsigned int *sg_offset_p, + struct scatterlist *meta_sgl, + unsigned short meta_sg_nents, + unsigned int *meta_sg_offset_p) { struct scatterlist *sg = sgl; struct mlx5_klm *klms = mr->descs; unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0; u32 lkey = mr->ibmr.pd->local_dma_lkey; - int i; + int i, j = 0; mr->ibmr.iova = sg_dma_address(sg) + sg_offset; mr->ibmr.length = 0; @@ -1911,12 +2087,36 @@ mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr, sg_offset = 0; } - mr->ndescs = i; if (sg_offset_p) *sg_offset_p = sg_offset; - return i; + mr->ndescs = i; + mr->data_length = mr->ibmr.length; + + if (meta_sg_nents) { + sg = meta_sgl; + sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0; + for_each_sg(meta_sgl, sg, meta_sg_nents, j) { + if (unlikely(i + j >= mr->max_descs)) + break; + klms[i + j].va = cpu_to_be64(sg_dma_address(sg) + + sg_offset); + klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) - + sg_offset); + klms[i + j].key = cpu_to_be32(lkey); + mr->ibmr.length += sg_dma_len(sg) - sg_offset; + + sg_offset = 0; + } + if (meta_sg_offset_p) + *meta_sg_offset_p = sg_offset; + + mr->meta_ndescs = j; + mr->meta_length = mr->ibmr.length - mr->data_length; + } + + return i + j; } static int mlx5_set_page(struct ib_mr *ibmr, u64 addr) @@ -1933,6 +2133,181 @@ static int mlx5_set_page(struct ib_mr *ibmr, u64 addr) return 0; } +static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr) +{ + struct mlx5_ib_mr *mr = to_mmr(ibmr); + __be64 *descs; + + if (unlikely(mr->ndescs + mr->meta_ndescs == mr->max_descs)) + return -ENOMEM; + + descs = mr->descs; + descs[mr->ndescs + mr->meta_ndescs++] = + cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR); + + return 0; +} + +static int +mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg, + int data_sg_nents, unsigned int *data_sg_offset, + struct scatterlist *meta_sg, int meta_sg_nents, + unsigned int *meta_sg_offset) +{ + struct mlx5_ib_mr *mr = to_mmr(ibmr); + struct mlx5_ib_mr *pi_mr = mr->mtt_mr; + int n; + + pi_mr->ndescs = 0; + pi_mr->meta_ndescs = 0; + pi_mr->meta_length = 0; + + ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map, + pi_mr->desc_size * pi_mr->max_descs, + DMA_TO_DEVICE); + + pi_mr->ibmr.page_size = ibmr->page_size; + n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset, + mlx5_set_page); + if (n != data_sg_nents) + return n; + + pi_mr->data_iova = pi_mr->ibmr.iova; + pi_mr->data_length = pi_mr->ibmr.length; + pi_mr->ibmr.length = pi_mr->data_length; + ibmr->length = pi_mr->data_length; + + if (meta_sg_nents) { + u64 page_mask = ~((u64)ibmr->page_size - 1); + u64 iova = pi_mr->data_iova; + + n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents, + meta_sg_offset, mlx5_set_page_pi); + + pi_mr->meta_length = pi_mr->ibmr.length; + /* + * PI address for the HW is the offset of the metadata address + * relative to the first data page address. + * It equals to first data page address + size of data pages + + * metadata offset at the first metadata page + */ + pi_mr->pi_iova = (iova & page_mask) + + pi_mr->ndescs * ibmr->page_size + + (pi_mr->ibmr.iova & ~page_mask); + /* + * In order to use one MTT MR for data and metadata, we register + * also the gaps between the end of the data and the start of + * the metadata (the sig MR will verify that the HW will access + * to right addresses). This mapping is safe because we use + * internal mkey for the registration. + */ + pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova; + pi_mr->ibmr.iova = iova; + ibmr->length += pi_mr->meta_length; + } + + ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map, + pi_mr->desc_size * pi_mr->max_descs, + DMA_TO_DEVICE); + + return n; +} + +static int +mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg, + int data_sg_nents, unsigned int *data_sg_offset, + struct scatterlist *meta_sg, int meta_sg_nents, + unsigned int *meta_sg_offset) +{ + struct mlx5_ib_mr *mr = to_mmr(ibmr); + struct mlx5_ib_mr *pi_mr = mr->klm_mr; + int n; + + pi_mr->ndescs = 0; + pi_mr->meta_ndescs = 0; + pi_mr->meta_length = 0; + + ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map, + pi_mr->desc_size * pi_mr->max_descs, + DMA_TO_DEVICE); + + n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset, + meta_sg, meta_sg_nents, meta_sg_offset); + + ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map, + pi_mr->desc_size * pi_mr->max_descs, + DMA_TO_DEVICE); + + /* This is zero-based memory region */ + pi_mr->data_iova = 0; + pi_mr->ibmr.iova = 0; + pi_mr->pi_iova = pi_mr->data_length; + ibmr->length = pi_mr->ibmr.length; + + return n; +} + +int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg, + int data_sg_nents, unsigned int *data_sg_offset, + struct scatterlist *meta_sg, int meta_sg_nents, + unsigned int *meta_sg_offset) +{ + struct mlx5_ib_mr *mr = to_mmr(ibmr); + struct mlx5_ib_mr *pi_mr = NULL; + int n; + + WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY); + + mr->ndescs = 0; + mr->data_length = 0; + mr->data_iova = 0; + mr->meta_ndescs = 0; + mr->pi_iova = 0; + /* + * As a performance optimization, if possible, there is no need to + * perform UMR operation to register the data/metadata buffers. + * First try to map the sg lists to PA descriptors with local_dma_lkey. + * Fallback to UMR only in case of a failure. + */ + n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents, + data_sg_offset, meta_sg, meta_sg_nents, + meta_sg_offset); + if (n == data_sg_nents + meta_sg_nents) + goto out; + /* + * As a performance optimization, if possible, there is no need to map + * the sg lists to KLM descriptors. First try to map the sg lists to MTT + * descriptors and fallback to KLM only in case of a failure. + * It's more efficient for the HW to work with MTT descriptors + * (especially in high load). + * Use KLM (indirect access) only if it's mandatory. + */ + pi_mr = mr->mtt_mr; + n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents, + data_sg_offset, meta_sg, meta_sg_nents, + meta_sg_offset); + if (n == data_sg_nents + meta_sg_nents) + goto out; + + pi_mr = mr->klm_mr; + n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents, + data_sg_offset, meta_sg, meta_sg_nents, + meta_sg_offset); + if (unlikely(n != data_sg_nents + meta_sg_nents)) + return -ENOMEM; + +out: + /* This is zero-based memory region */ + ibmr->iova = 0; + mr->pi_mr = pi_mr; + if (pi_mr) + ibmr->sig_attrs->meta_length = pi_mr->meta_length; + else + ibmr->sig_attrs->meta_length = mr->meta_length; + + return 0; +} + int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset) { @@ -1946,7 +2321,8 @@ int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, DMA_TO_DEVICE); if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS) - n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset); + n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0, + NULL); else n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, mlx5_set_page); diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 91507a2e9290..5b642d81e617 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -150,7 +150,7 @@ static struct ib_umem_odp *odp_lookup(u64 start, u64 length, if (!rb) goto not_found; odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb); - if (ib_umem_start(&odp->umem) > start + length) + if (ib_umem_start(odp) > start + length) goto not_found; } not_found: @@ -200,7 +200,7 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, static void mr_leaf_free_action(struct work_struct *work) { struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work); - int idx = ib_umem_start(&odp->umem) >> MLX5_IMR_MTT_SHIFT; + int idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT; struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent; mr->parent = NULL; @@ -224,7 +224,6 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(struct mlx5_mtt)) - 1; u64 idx = 0, blk_start_idx = 0; - struct ib_umem *umem; int in_block = 0; u64 addr; @@ -232,15 +231,14 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, pr_err("invalidation called on NULL umem or non-ODP umem\n"); return; } - umem = &umem_odp->umem; mr = umem_odp->private; if (!mr || !mr->ibmr.pd) return; - start = max_t(u64, ib_umem_start(umem), start); - end = min_t(u64, ib_umem_end(umem), end); + start = max_t(u64, ib_umem_start(umem_odp), start); + end = min_t(u64, ib_umem_end(umem_odp), end); /* * Iteration one - zap the HW's MTTs. The notifiers_count ensures that @@ -249,8 +247,8 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, * but they will write 0s as well, so no difference in the end result. */ - for (addr = start; addr < end; addr += BIT(umem->page_shift)) { - idx = (addr - ib_umem_start(umem)) >> umem->page_shift; + for (addr = start; addr < end; addr += BIT(umem_odp->page_shift)) { + idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift; /* * Strive to write the MTTs in chunks, but avoid overwriting * non-existing MTTs. The huristic here can be improved to @@ -544,13 +542,12 @@ static int mr_leaf_free(struct ib_umem_odp *umem_odp, u64 start, u64 end, void *cookie) { struct mlx5_ib_mr *mr = umem_odp->private, *imr = cookie; - struct ib_umem *umem = &umem_odp->umem; if (mr->parent != imr) return 0; - ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem), - ib_umem_end(umem)); + ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), + ib_umem_end(umem_odp)); if (umem_odp->dying) return 0; @@ -602,9 +599,9 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, } next_mr: - size = min_t(size_t, bcnt, ib_umem_end(&odp->umem) - io_virt); + size = min_t(size_t, bcnt, ib_umem_end(odp) - io_virt); - page_shift = mr->umem->page_shift; + page_shift = odp->page_shift; page_mask = ~(BIT(page_shift) - 1); start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift; access_mask = ODP_READ_ALLOWED_BIT; @@ -768,7 +765,7 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, bcnt -= *bytes_committed; next_mr: - mmkey = __mlx5_mr_lookup(dev->mdev, mlx5_base_mkey(key)); + mmkey = xa_load(&dev->mdev->priv.mkey_table, mlx5_base_mkey(key)); if (!mkey_is_eq(mmkey, key)) { mlx5_ib_dbg(dev, "failed to find mkey %x\n", key); ret = -EFAULT; @@ -1488,9 +1485,11 @@ static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq) mlx5_eq_update_ci(eq->core, cc, 1); } -static irqreturn_t mlx5_ib_eq_pf_int(int irq, void *eq_ptr) +static int mlx5_ib_eq_pf_int(struct notifier_block *nb, unsigned long type, + void *data) { - struct mlx5_ib_pf_eq *eq = eq_ptr; + struct mlx5_ib_pf_eq *eq = + container_of(nb, struct mlx5_ib_pf_eq, irq_nb); unsigned long flags; if (spin_trylock_irqsave(&eq->lock, flags)) { @@ -1553,20 +1552,26 @@ mlx5_ib_create_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq) goto err_mempool; } + eq->irq_nb.notifier_call = mlx5_ib_eq_pf_int; param = (struct mlx5_eq_param) { - .index = MLX5_EQ_PFAULT_IDX, - .mask = 1 << MLX5_EVENT_TYPE_PAGE_FAULT, + .irq_index = 0, .nent = MLX5_IB_NUM_PF_EQE, - .context = eq, - .handler = mlx5_ib_eq_pf_int }; - eq->core = mlx5_eq_create_generic(dev->mdev, "mlx5_ib_page_fault_eq", ¶m); + param.mask[0] = 1ull << MLX5_EVENT_TYPE_PAGE_FAULT; + eq->core = mlx5_eq_create_generic(dev->mdev, ¶m); if (IS_ERR(eq->core)) { err = PTR_ERR(eq->core); goto err_wq; } + err = mlx5_eq_enable(dev->mdev, eq->core, &eq->irq_nb); + if (err) { + mlx5_ib_err(dev, "failed to enable odp EQ %d\n", err); + goto err_eq; + } return 0; +err_eq: + mlx5_eq_destroy_generic(dev->mdev, eq->core); err_wq: destroy_workqueue(eq->wq); err_mempool: @@ -1579,6 +1584,7 @@ mlx5_ib_destroy_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq) { int err; + mlx5_eq_disable(dev->mdev, eq->core, &eq->irq_nb); err = mlx5_eq_destroy_generic(dev->mdev, eq->core); cancel_work_sync(&eq->work); destroy_workqueue(eq->wq); @@ -1677,8 +1683,8 @@ static void num_pending_prefetch_dec(struct mlx5_ib_dev *dev, struct mlx5_core_mkey *mmkey; struct mlx5_ib_mr *mr; - mmkey = __mlx5_mr_lookup(dev->mdev, - mlx5_base_mkey(sg_list[i].lkey)); + mmkey = xa_load(&dev->mdev->priv.mkey_table, + mlx5_base_mkey(sg_list[i].lkey)); mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); atomic_dec(&mr->num_pending_prefetch); } @@ -1697,8 +1703,8 @@ static bool num_pending_prefetch_inc(struct ib_pd *pd, struct mlx5_core_mkey *mmkey; struct mlx5_ib_mr *mr; - mmkey = __mlx5_mr_lookup(dev->mdev, - mlx5_base_mkey(sg_list[i].lkey)); + mmkey = xa_load(&dev->mdev->priv.mkey_table, + mlx5_base_mkey(sg_list[i].lkey)); if (!mmkey || mmkey->key != sg_list[i].lkey) { ret = false; break; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index f6623c77443a..2a97619ed603 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -34,6 +34,7 @@ #include <rdma/ib_umem.h> #include <rdma/ib_cache.h> #include <rdma/ib_user_verbs.h> +#include <rdma/rdma_counter.h> #include <linux/mlx5/fs.h> #include "mlx5_ib.h" #include "ib_rep.h" @@ -442,9 +443,9 @@ static int calc_send_wqe(struct ib_qp_init_attr *attr) } size += attr->cap.max_send_sge * sizeof(struct mlx5_wqe_data_seg); - if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN && + if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN && ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB) < MLX5_SIG_WQE_SIZE) - return MLX5_SIG_WQE_SIZE; + return MLX5_SIG_WQE_SIZE; else return ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB); } @@ -496,9 +497,6 @@ static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, sizeof(struct mlx5_wqe_inline_seg); attr->cap.max_inline_data = qp->max_inline_data; - if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) - qp->signature_en = true; - wq_size = roundup_pow_of_two(attr->cap.max_send_wr * wqe_size); qp->sq.wqe_cnt = wq_size / MLX5_SEND_WQE_BB; if (qp->sq.wqe_cnt > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz))) { @@ -790,8 +788,7 @@ static void destroy_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd, atomic_dec(&dev->delay_drop.rqs_cnt); mlx5_ib_db_unmap_user(context, &rwq->db); - if (rwq->umem) - ib_umem_release(rwq->umem); + ib_umem_release(rwq->umem); } static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd, @@ -977,8 +974,7 @@ err_free: kvfree(*in); err_umem: - if (ubuffer->umem) - ib_umem_release(ubuffer->umem); + ib_umem_release(ubuffer->umem); err_bfreg: if (bfregn != MLX5_IB_INVALID_BFREG) @@ -997,8 +993,7 @@ static void destroy_qp_user(struct mlx5_ib_dev *dev, struct ib_pd *pd, ibucontext); mlx5_ib_db_unmap_user(context, &qp->db); - if (base->ubuffer.umem) - ib_umem_release(base->ubuffer.umem); + ib_umem_release(base->ubuffer.umem); /* * Free only the BFREGs which are handled by the kernel. @@ -1042,7 +1037,7 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, void *qpc; int err; - if (init_attr->create_flags & ~(IB_QP_CREATE_SIGNATURE_EN | + if (init_attr->create_flags & ~(IB_QP_CREATE_INTEGRITY_EN | IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK | IB_QP_CREATE_IPOIB_UD_LSO | IB_QP_CREATE_NETIF_QP | @@ -3386,6 +3381,35 @@ static unsigned int get_tx_affinity(struct mlx5_ib_dev *dev, return tx_port_affinity; } +static int __mlx5_ib_qp_set_counter(struct ib_qp *qp, + struct rdma_counter *counter) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_ib_qp *mqp = to_mqp(qp); + struct mlx5_qp_context context = {}; + struct mlx5_ib_port *mibport = NULL; + struct mlx5_ib_qp_base *base; + u32 set_id; + + if (!MLX5_CAP_GEN(dev->mdev, rts2rts_qp_counters_set_id)) + return 0; + + if (counter) { + set_id = counter->id; + } else { + mibport = &dev->port[mqp->port - 1]; + set_id = mibport->cnts.set_id; + } + + base = &mqp->trans_qp.base; + context.qp_counter_set_usr_page &= cpu_to_be32(0xffffff); + context.qp_counter_set_usr_page |= cpu_to_be32(set_id << 24); + return mlx5_core_qp_modify(dev->mdev, + MLX5_CMD_OP_RTS2RTS_QP, + MLX5_QP_OPTPAR_COUNTER_SET_ID, + &context, &base->mqp); +} + static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, @@ -3439,6 +3463,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, struct mlx5_ib_port *mibport = NULL; enum mlx5_qp_state mlx5_cur, mlx5_new; enum mlx5_qp_optpar optpar; + u32 set_id = 0; int mlx5_st; int err; u16 op; @@ -3601,8 +3626,12 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, port_num = 0; mibport = &dev->port[port_num]; + if (ibqp->counter) + set_id = ibqp->counter->id; + else + set_id = mibport->cnts.set_id; context->qp_counter_set_usr_page |= - cpu_to_be32((u32)(mibport->cnts.set_id) << 24); + cpu_to_be32(set_id << 24); } if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) @@ -3630,7 +3659,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, raw_qp_param.operation = op; if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { - raw_qp_param.rq_q_ctr_id = mibport->cnts.set_id; + raw_qp_param.rq_q_ctr_id = set_id; raw_qp_param.set_mask |= MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID; } @@ -3707,6 +3736,12 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, qp->db.db[MLX5_SND_DBR] = 0; } + if ((new_state == IB_QPS_RTS) && qp->counter_pending) { + err = __mlx5_ib_qp_set_counter(ibqp, ibqp->counter); + if (!err) + qp->counter_pending = 0; + } + out: kfree(context); return err; @@ -4170,15 +4205,13 @@ static __be64 sig_mkey_mask(void) } static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr, - struct mlx5_ib_mr *mr, bool umr_inline) + struct mlx5_ib_mr *mr, u8 flags) { - int size = mr->ndescs * mr->desc_size; + int size = (mr->ndescs + mr->meta_ndescs) * mr->desc_size; memset(umr, 0, sizeof(*umr)); - umr->flags = MLX5_UMR_CHECK_NOT_FREE; - if (umr_inline) - umr->flags |= MLX5_UMR_INLINE; + umr->flags = flags; umr->xlt_octowords = cpu_to_be16(get_xlt_octo(size)); umr->mkey_mask = frwr_mkey_mask(); } @@ -4305,7 +4338,7 @@ static void set_reg_mkey_seg(struct mlx5_mkey_seg *seg, struct mlx5_ib_mr *mr, u32 key, int access) { - int ndescs = ALIGN(mr->ndescs, 8) >> 1; + int ndescs = ALIGN(mr->ndescs + mr->meta_ndescs, 8) >> 1; memset(seg, 0, sizeof(*seg)); @@ -4356,7 +4389,7 @@ static void set_reg_data_seg(struct mlx5_wqe_data_seg *dseg, struct mlx5_ib_mr *mr, struct mlx5_ib_pd *pd) { - int bcount = mr->desc_size * mr->ndescs; + int bcount = mr->desc_size * (mr->ndescs + mr->meta_ndescs); dseg->addr = cpu_to_be64(mr->desc_map); dseg->byte_count = cpu_to_be32(ALIGN(bcount, 64)); @@ -4549,23 +4582,37 @@ static int mlx5_set_bsf(struct ib_mr *sig_mr, return 0; } -static int set_sig_data_segment(const struct ib_sig_handover_wr *wr, - struct mlx5_ib_qp *qp, void **seg, - int *size, void **cur_edge) +static int set_sig_data_segment(const struct ib_send_wr *send_wr, + struct ib_mr *sig_mr, + struct ib_sig_attrs *sig_attrs, + struct mlx5_ib_qp *qp, void **seg, int *size, + void **cur_edge) { - struct ib_sig_attrs *sig_attrs = wr->sig_attrs; - struct ib_mr *sig_mr = wr->sig_mr; struct mlx5_bsf *bsf; - u32 data_len = wr->wr.sg_list->length; - u32 data_key = wr->wr.sg_list->lkey; - u64 data_va = wr->wr.sg_list->addr; + u32 data_len; + u32 data_key; + u64 data_va; + u32 prot_len = 0; + u32 prot_key = 0; + u64 prot_va = 0; + bool prot = false; int ret; int wqe_size; + struct mlx5_ib_mr *mr = to_mmr(sig_mr); + struct mlx5_ib_mr *pi_mr = mr->pi_mr; + + data_len = pi_mr->data_length; + data_key = pi_mr->ibmr.lkey; + data_va = pi_mr->data_iova; + if (pi_mr->meta_ndescs) { + prot_len = pi_mr->meta_length; + prot_key = pi_mr->ibmr.lkey; + prot_va = pi_mr->pi_iova; + prot = true; + } - if (!wr->prot || - (data_key == wr->prot->lkey && - data_va == wr->prot->addr && - data_len == wr->prot->length)) { + if (!prot || (data_key == prot_key && data_va == prot_va && + data_len == prot_len)) { /** * Source domain doesn't contain signature information * or data and protection are interleaved in memory. @@ -4599,8 +4646,6 @@ static int set_sig_data_segment(const struct ib_sig_handover_wr *wr, struct mlx5_stride_block_ctrl_seg *sblock_ctrl; struct mlx5_stride_block_entry *data_sentry; struct mlx5_stride_block_entry *prot_sentry; - u32 prot_key = wr->prot->lkey; - u64 prot_va = wr->prot->addr; u16 block_size = sig_attrs->mem.sig.dif.pi_interval; int prot_size; @@ -4650,17 +4695,15 @@ static int set_sig_data_segment(const struct ib_sig_handover_wr *wr, } static void set_sig_mkey_segment(struct mlx5_mkey_seg *seg, - const struct ib_sig_handover_wr *wr, u32 size, - u32 length, u32 pdn) + struct ib_mr *sig_mr, int access_flags, + u32 size, u32 length, u32 pdn) { - struct ib_mr *sig_mr = wr->sig_mr; u32 sig_key = sig_mr->rkey; u8 sigerr = to_mmr(sig_mr)->sig->sigerr_count & 1; memset(seg, 0, sizeof(*seg)); - seg->flags = get_umr_flags(wr->access_flags) | - MLX5_MKC_ACCESS_MODE_KLMS; + seg->flags = get_umr_flags(access_flags) | MLX5_MKC_ACCESS_MODE_KLMS; seg->qpn_mkey7_0 = cpu_to_be32((sig_key & 0xff) | 0xffffff00); seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL | sigerr << 26 | MLX5_MKEY_BSF_EN | pdn); @@ -4680,49 +4723,50 @@ static void set_sig_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, umr->mkey_mask = sig_mkey_mask(); } - -static int set_sig_umr_wr(const struct ib_send_wr *send_wr, - struct mlx5_ib_qp *qp, void **seg, int *size, - void **cur_edge) +static int set_pi_umr_wr(const struct ib_send_wr *send_wr, + struct mlx5_ib_qp *qp, void **seg, int *size, + void **cur_edge) { - const struct ib_sig_handover_wr *wr = sig_handover_wr(send_wr); - struct mlx5_ib_mr *sig_mr = to_mmr(wr->sig_mr); + const struct ib_reg_wr *wr = reg_wr(send_wr); + struct mlx5_ib_mr *sig_mr = to_mmr(wr->mr); + struct mlx5_ib_mr *pi_mr = sig_mr->pi_mr; + struct ib_sig_attrs *sig_attrs = sig_mr->ibmr.sig_attrs; u32 pdn = get_pd(qp)->pdn; u32 xlt_size; int region_len, ret; - if (unlikely(wr->wr.num_sge != 1) || - unlikely(wr->access_flags & IB_ACCESS_REMOTE_ATOMIC) || - unlikely(!sig_mr->sig) || unlikely(!qp->signature_en) || + if (unlikely(send_wr->num_sge != 0) || + unlikely(wr->access & IB_ACCESS_REMOTE_ATOMIC) || + unlikely(!sig_mr->sig) || unlikely(!qp->ibqp.integrity_en) || unlikely(!sig_mr->sig->sig_status_checked)) return -EINVAL; /* length of the protected region, data + protection */ - region_len = wr->wr.sg_list->length; - if (wr->prot && - (wr->prot->lkey != wr->wr.sg_list->lkey || - wr->prot->addr != wr->wr.sg_list->addr || - wr->prot->length != wr->wr.sg_list->length)) - region_len += wr->prot->length; + region_len = pi_mr->ibmr.length; /** * KLM octoword size - if protection was provided * then we use strided block format (3 octowords), * else we use single KLM (1 octoword) **/ - xlt_size = wr->prot ? 0x30 : sizeof(struct mlx5_klm); + if (sig_attrs->mem.sig_type != IB_SIG_TYPE_NONE) + xlt_size = 0x30; + else + xlt_size = sizeof(struct mlx5_klm); set_sig_umr_segment(*seg, xlt_size); *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; handle_post_send_edge(&qp->sq, seg, *size, cur_edge); - set_sig_mkey_segment(*seg, wr, xlt_size, region_len, pdn); + set_sig_mkey_segment(*seg, wr->mr, wr->access, xlt_size, region_len, + pdn); *seg += sizeof(struct mlx5_mkey_seg); *size += sizeof(struct mlx5_mkey_seg) / 16; handle_post_send_edge(&qp->sq, seg, *size, cur_edge); - ret = set_sig_data_segment(wr, qp, seg, size, cur_edge); + ret = set_sig_data_segment(send_wr, wr->mr, sig_attrs, qp, seg, size, + cur_edge); if (ret) return ret; @@ -4759,12 +4803,14 @@ static int set_psv_wr(struct ib_sig_domain *domain, static int set_reg_wr(struct mlx5_ib_qp *qp, const struct ib_reg_wr *wr, - void **seg, int *size, void **cur_edge) + void **seg, int *size, void **cur_edge, + bool check_not_free) { struct mlx5_ib_mr *mr = to_mmr(wr->mr); struct mlx5_ib_pd *pd = to_mpd(qp->ibqp.pd); - size_t mr_list_size = mr->ndescs * mr->desc_size; + int mr_list_size = (mr->ndescs + mr->meta_ndescs) * mr->desc_size; bool umr_inline = mr_list_size <= MLX5_IB_SQ_UMR_INLINE_THRESHOLD; + u8 flags = 0; if (unlikely(wr->wr.send_flags & IB_SEND_INLINE)) { mlx5_ib_warn(to_mdev(qp->ibqp.device), @@ -4772,7 +4818,12 @@ static int set_reg_wr(struct mlx5_ib_qp *qp, return -EINVAL; } - set_reg_umr_seg(*seg, mr, umr_inline); + if (check_not_free) + flags |= MLX5_UMR_CHECK_NOT_FREE; + if (umr_inline) + flags |= MLX5_UMR_INLINE; + + set_reg_umr_seg(*seg, mr, flags); *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; handle_post_send_edge(&qp->sq, seg, *size, cur_edge); @@ -4898,8 +4949,12 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, struct mlx5_wqe_ctrl_seg *ctrl = NULL; /* compiler warning */ struct mlx5_ib_dev *dev = to_mdev(ibqp->device); struct mlx5_core_dev *mdev = dev->mdev; + struct ib_reg_wr reg_pi_wr; struct mlx5_ib_qp *qp; struct mlx5_ib_mr *mr; + struct mlx5_ib_mr *pi_mr; + struct mlx5_ib_mr pa_pi_mr; + struct ib_sig_attrs *sig_attrs; struct mlx5_wqe_xrc_seg *xrc; struct mlx5_bf *bf; void *cur_edge; @@ -4953,7 +5008,8 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, goto out; } - if (wr->opcode == IB_WR_REG_MR) { + if (wr->opcode == IB_WR_REG_MR || + wr->opcode == IB_WR_REG_MR_INTEGRITY) { fence = dev->umr_fence; next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; } else { @@ -5003,7 +5059,7 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, qp->sq.wr_data[idx] = IB_WR_REG_MR; ctrl->imm = cpu_to_be32(reg_wr(wr)->key); err = set_reg_wr(qp, reg_wr(wr), &seg, &size, - &cur_edge); + &cur_edge, true); if (err) { *bad_wr = wr; goto out; @@ -5011,26 +5067,82 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, num_sge = 0; break; - case IB_WR_REG_SIG_MR: - qp->sq.wr_data[idx] = IB_WR_REG_SIG_MR; - mr = to_mmr(sig_handover_wr(wr)->sig_mr); - + case IB_WR_REG_MR_INTEGRITY: + qp->sq.wr_data[idx] = IB_WR_REG_MR_INTEGRITY; + + mr = to_mmr(reg_wr(wr)->mr); + pi_mr = mr->pi_mr; + + if (pi_mr) { + memset(®_pi_wr, 0, + sizeof(struct ib_reg_wr)); + + reg_pi_wr.mr = &pi_mr->ibmr; + reg_pi_wr.access = reg_wr(wr)->access; + reg_pi_wr.key = pi_mr->ibmr.rkey; + + ctrl->imm = cpu_to_be32(reg_pi_wr.key); + /* UMR for data + prot registration */ + err = set_reg_wr(qp, ®_pi_wr, &seg, + &size, &cur_edge, + false); + if (err) { + *bad_wr = wr; + goto out; + } + finish_wqe(qp, ctrl, seg, size, + cur_edge, idx, wr->wr_id, + nreq, fence, + MLX5_OPCODE_UMR); + + err = begin_wqe(qp, &seg, &ctrl, wr, + &idx, &size, &cur_edge, + nreq); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + } else { + memset(&pa_pi_mr, 0, + sizeof(struct mlx5_ib_mr)); + /* No UMR, use local_dma_lkey */ + pa_pi_mr.ibmr.lkey = + mr->ibmr.pd->local_dma_lkey; + + pa_pi_mr.ndescs = mr->ndescs; + pa_pi_mr.data_length = mr->data_length; + pa_pi_mr.data_iova = mr->data_iova; + if (mr->meta_ndescs) { + pa_pi_mr.meta_ndescs = + mr->meta_ndescs; + pa_pi_mr.meta_length = + mr->meta_length; + pa_pi_mr.pi_iova = mr->pi_iova; + } + + pa_pi_mr.ibmr.length = mr->ibmr.length; + mr->pi_mr = &pa_pi_mr; + } ctrl->imm = cpu_to_be32(mr->ibmr.rkey); - err = set_sig_umr_wr(wr, qp, &seg, &size, - &cur_edge); + /* UMR for sig MR */ + err = set_pi_umr_wr(wr, qp, &seg, &size, + &cur_edge); if (err) { mlx5_ib_warn(dev, "\n"); *bad_wr = wr; goto out; } - finish_wqe(qp, ctrl, seg, size, cur_edge, idx, wr->wr_id, nreq, fence, MLX5_OPCODE_UMR); + /* * SET_PSV WQEs are not signaled and solicited * on error */ + sig_attrs = mr->ibmr.sig_attrs; err = __begin_wqe(qp, &seg, &ctrl, wr, &idx, &size, &cur_edge, nreq, false, true); @@ -5040,19 +5152,18 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, *bad_wr = wr; goto out; } - - err = set_psv_wr(&sig_handover_wr(wr)->sig_attrs->mem, - mr->sig->psv_memory.psv_idx, &seg, - &size); + err = set_psv_wr(&sig_attrs->mem, + mr->sig->psv_memory.psv_idx, + &seg, &size); if (err) { mlx5_ib_warn(dev, "\n"); *bad_wr = wr; goto out; } - finish_wqe(qp, ctrl, seg, size, cur_edge, idx, - wr->wr_id, nreq, fence, + wr->wr_id, nreq, next_fence, MLX5_OPCODE_SET_PSV); + err = __begin_wqe(qp, &seg, &ctrl, wr, &idx, &size, &cur_edge, nreq, false, true); @@ -5062,20 +5173,20 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, *bad_wr = wr; goto out; } - - err = set_psv_wr(&sig_handover_wr(wr)->sig_attrs->wire, - mr->sig->psv_wire.psv_idx, &seg, - &size); + err = set_psv_wr(&sig_attrs->wire, + mr->sig->psv_wire.psv_idx, + &seg, &size); if (err) { mlx5_ib_warn(dev, "\n"); *bad_wr = wr; goto out; } - finish_wqe(qp, ctrl, seg, size, cur_edge, idx, - wr->wr_id, nreq, fence, + wr->wr_id, nreq, next_fence, MLX5_OPCODE_SET_PSV); - qp->next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; + + qp->next_fence = + MLX5_FENCE_MODE_INITIATOR_SMALL; num_sge = 0; goto skip_psv; @@ -6047,7 +6158,7 @@ err: return ERR_PTR(err); } -int mlx5_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata) +void mlx5_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(wq->device); struct mlx5_ib_rwq *rwq = to_mrwq(wq); @@ -6055,8 +6166,6 @@ int mlx5_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata) mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp); destroy_user_rq(dev, wq->pd, rwq, udata); kfree(rwq); - - return 0; } struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device, @@ -6297,7 +6406,7 @@ static void handle_drain_completion(struct ib_cq *cq, /* Run the CQ handler - this makes sure that the drain WR will * be processed if wasn't processed yet. */ - mcq->mcq.comp(&mcq->mcq); + mcq->mcq.comp(&mcq->mcq, NULL); } wait_for_completion(&sdrain->done); @@ -6367,3 +6476,34 @@ void mlx5_ib_drain_rq(struct ib_qp *qp) handle_drain_completion(cq, &rdrain, dev); } + +/** + * Bind a qp to a counter. If @counter is NULL then bind the qp to + * the default counter + */ +int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter) +{ + struct mlx5_ib_qp *mqp = to_mqp(qp); + int err = 0; + + mutex_lock(&mqp->mutex); + if (mqp->state == IB_QPS_RESET) { + qp->counter = counter; + goto out; + } + + if (mqp->state == IB_QPS_RTS) { + err = __mlx5_ib_qp_set_counter(qp, counter); + if (!err) + qp->counter = counter; + + goto out; + } + + mqp->counter_pending = 1; + qp->counter = counter; + +out: + mutex_unlock(&mqp->mutex); + return err; +} |