Diffstat (limited to 'drivers/infiniband/core')
33 files changed, 2929 insertions, 708 deletions
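A recurring theme in the diff below is the replacement of the old ibnl_add_client()/ibnl_remove_client() interface with rdma_nl_register()/rdma_nl_unregister() and per-operation .doit/.dump callbacks that receive a netlink_ext_ack. The following sketch is not part of the patch; it only illustrates, with a hypothetical client index, opcode and handler name, how a consumer module would hook into the reworked RDMA netlink core introduced here.

#include <linux/module.h>
#include <net/netlink.h>
#include <rdma/rdma_netlink.h>

/* Hypothetical single-request handler; extack is now passed through. */
static int example_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
			struct netlink_ext_ack *extack)
{
	return 0;
}

static const struct rdma_nl_cbs example_cb_table[] = {
	[0] = {
		.doit  = example_doit,
		/* CAP_NET_ADMIN is now enforced centrally by the netlink core. */
		.flags = RDMA_NL_ADMIN_PERM,
	},
};

static int __init example_init(void)
{
	/* Registration no longer takes an op count or returns an error. */
	rdma_nl_register(RDMA_NL_IWCM /* placeholder client index */,
			 example_cb_table);
	return 0;
}

static void __exit example_exit(void)
{
	rdma_nl_unregister(RDMA_NL_IWCM);
}

module_init(example_init);
module_exit(example_exit);
/* Lets the core request_module() this client on demand. */
MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_IWCM, 2);
MODULE_LICENSE("GPL");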
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index e3cdafff8ece..9c0a2b5c834e 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 infiniband-$(CONFIG_INFINIBAND_ADDR_TRANS) := rdma_cm.o user_access-$(CONFIG_INFINIBAND_ADDR_TRANS) := rdma_ucm.o @@ -11,7 +12,8 @@ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \ device.o fmr_pool.o cache.o netlink.o \ roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \ multicast.o mad.o smi.o agent.o mad_rmpp.o \ - security.o + security.o nldev.o + ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o @@ -31,4 +33,5 @@ ib_umad-y := user_mad.o ib_ucm-y := ucm.o ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ - rdma_core.o uverbs_std_types.o + rdma_core.o uverbs_std_types.o uverbs_ioctl.o \ + uverbs_ioctl_merge.o diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index a6cb379a4ebc..12523f630b61 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -61,6 +61,7 @@ struct addr_req { void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context); unsigned long timeout; + struct delayed_work work; int status; u32 seq; }; @@ -129,13 +130,11 @@ static void ib_nl_process_good_ip_rsep(const struct nlmsghdr *nlh) } int ib_nl_handle_ip_res_resp(struct sk_buff *skb, - struct netlink_callback *cb) + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { - const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh; - if ((nlh->nlmsg_flags & NLM_F_REQUEST) || - !(NETLINK_CB(skb).sk) || - !netlink_capable(skb, CAP_NET_ADMIN)) + !(NETLINK_CB(skb).sk)) return -EPERM; if (ib_nl_is_good_ip_resp(nlh)) @@ -185,7 +184,7 @@ static int ib_nl_ip_send_msg(struct rdma_dev_addr *dev_addr, /* Repair the nlmsg header length */ nlmsg_end(skb, nlh); - ibnl_multicast(skb, nlh, RDMA_NL_GROUP_LS, GFP_KERNEL); + rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, GFP_KERNEL); /* Make the request retry, so when we get the response from userspace * we will have something. 
@@ -268,6 +267,7 @@ int rdma_translate_ip(const struct sockaddr *addr, return ret; ret = rdma_copy_addr(dev_addr, dev, NULL); + dev_addr->bound_dev_if = dev->ifindex; if (vlan_id) *vlan_id = rdma_vlan_dev_vlan_id(dev); dev_put(dev); @@ -280,6 +280,7 @@ int rdma_translate_ip(const struct sockaddr *addr, &((const struct sockaddr_in6 *)addr)->sin6_addr, dev, 1)) { ret = rdma_copy_addr(dev_addr, dev, NULL); + dev_addr->bound_dev_if = dev->ifindex; if (vlan_id) *vlan_id = rdma_vlan_dev_vlan_id(dev); break; @@ -293,7 +294,7 @@ int rdma_translate_ip(const struct sockaddr *addr, } EXPORT_SYMBOL(rdma_translate_ip); -static void set_timeout(unsigned long time) +static void set_timeout(struct delayed_work *delayed_work, unsigned long time) { unsigned long delay; @@ -301,7 +302,7 @@ static void set_timeout(unsigned long time) if ((long)delay < 0) delay = 0; - mod_delayed_work(addr_wq, &work, delay); + mod_delayed_work(addr_wq, delayed_work, delay); } static void queue_req(struct addr_req *req) @@ -316,15 +317,14 @@ static void queue_req(struct addr_req *req) list_add(&req->list, &temp_req->list); - if (req_list.next == &req->list) - set_timeout(req->timeout); + set_timeout(&req->work, req->timeout); mutex_unlock(&lock); } static int ib_nl_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, const void *daddr, u32 seq, u16 family) { - if (ibnl_chk_listeners(RDMA_NL_GROUP_LS)) + if (rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) return -EADDRNOTAVAIL; /* We fill in what we can, the response will fill the rest */ @@ -405,10 +405,10 @@ static int addr4_resolve(struct sockaddr_in *src_in, fl4.saddr = src_ip; fl4.flowi4_oif = addr->bound_dev_if; rt = ip_route_output_key(addr->net, &fl4); - if (IS_ERR(rt)) { - ret = PTR_ERR(rt); - goto out; - } + ret = PTR_ERR_OR_ZERO(rt); + if (ret) + return ret; + src_in->sin_family = AF_INET; src_in->sin_addr.s_addr = fl4.saddr; @@ -423,8 +423,6 @@ static int addr4_resolve(struct sockaddr_in *src_in, *prt = rt; return 0; -out: - return ret; } #if IS_ENABLED(CONFIG_IPV6) @@ -509,6 +507,11 @@ static int addr_resolve(struct sockaddr *src_in, struct dst_entry *dst; int ret; + if (!addr->net) { + pr_warn_ratelimited("%s: missing namespace\n", __func__); + return -EINVAL; + } + if (src_in->sa_family == AF_INET) { struct rtable *rt = NULL; const struct sockaddr_in *dst_in4 = @@ -522,8 +525,12 @@ static int addr_resolve(struct sockaddr *src_in, if (resolve_neigh) ret = addr_resolve_neigh(&rt->dst, dst_in, addr, seq); - ndev = rt->dst.dev; - dev_hold(ndev); + if (addr->bound_dev_if) { + ndev = dev_get_by_index(addr->net, addr->bound_dev_if); + } else { + ndev = rt->dst.dev; + dev_hold(ndev); + } ip_rt_put(rt); } else { @@ -539,19 +546,63 @@ static int addr_resolve(struct sockaddr *src_in, if (resolve_neigh) ret = addr_resolve_neigh(dst, dst_in, addr, seq); - ndev = dst->dev; - dev_hold(ndev); + if (addr->bound_dev_if) { + ndev = dev_get_by_index(addr->net, addr->bound_dev_if); + } else { + ndev = dst->dev; + dev_hold(ndev); + } dst_release(dst); } - addr->bound_dev_if = ndev->ifindex; - addr->net = dev_net(ndev); + if (ndev->flags & IFF_LOOPBACK) { + ret = rdma_translate_ip(dst_in, addr, NULL); + /* + * Put the loopback device and get the translated + * device instead. 
+ */ + dev_put(ndev); + ndev = dev_get_by_index(addr->net, addr->bound_dev_if); + } else { + addr->bound_dev_if = ndev->ifindex; + } dev_put(ndev); return ret; } +static void process_one_req(struct work_struct *_work) +{ + struct addr_req *req; + struct sockaddr *src_in, *dst_in; + + mutex_lock(&lock); + req = container_of(_work, struct addr_req, work.work); + + if (req->status == -ENODATA) { + src_in = (struct sockaddr *)&req->src_addr; + dst_in = (struct sockaddr *)&req->dst_addr; + req->status = addr_resolve(src_in, dst_in, req->addr, + true, req->seq); + if (req->status && time_after_eq(jiffies, req->timeout)) { + req->status = -ETIMEDOUT; + } else if (req->status == -ENODATA) { + /* requeue the work for retrying again */ + set_timeout(&req->work, req->timeout); + mutex_unlock(&lock); + return; + } + } + list_del(&req->list); + mutex_unlock(&lock); + + req->callback(req->status, (struct sockaddr *)&req->src_addr, + req->addr, req->context); + put_client(req->client); + kfree(req); +} + static void process_req(struct work_struct *work) { struct addr_req *req, *temp_req; @@ -569,20 +620,23 @@ static void process_req(struct work_struct *work) true, req->seq); if (req->status && time_after_eq(jiffies, req->timeout)) req->status = -ETIMEDOUT; - else if (req->status == -ENODATA) + else if (req->status == -ENODATA) { + set_timeout(&req->work, req->timeout); continue; + } } list_move_tail(&req->list, &done_list); } - if (!list_empty(&req_list)) { - req = list_entry(req_list.next, struct addr_req, list); - set_timeout(req->timeout); - } mutex_unlock(&lock); list_for_each_entry_safe(req, temp_req, &done_list, list) { list_del(&req->list); + /* It is safe to cancel other work items from this work item + * because at a time there can be only one work item running + * with this single threaded work queue. 
+ */ + cancel_delayed_work(&req->work); req->callback(req->status, (struct sockaddr *) &req->src_addr, req->addr, req->context); put_client(req->client); @@ -625,6 +679,7 @@ int rdma_resolve_ip(struct rdma_addr_client *client, req->context = context; req->client = client; atomic_inc(&client->refcount); + INIT_DELAYED_WORK(&req->work, process_one_req); req->seq = (u32)atomic_inc_return(&ib_nl_addr_request_seq); req->status = addr_resolve(src_in, dst_in, addr, true, req->seq); @@ -679,7 +734,7 @@ void rdma_addr_cancel(struct rdma_dev_addr *addr) req->status = -ECANCELED; req->timeout = jiffies; list_move(&req->list, &req_list); - set_timeout(req->timeout); + set_timeout(&req->work, req->timeout); break; } } @@ -785,9 +840,8 @@ static int netevent_callback(struct notifier_block *self, unsigned long event, if (event == NETEVENT_NEIGH_UPDATE) { struct neighbour *neigh = ctx; - if (neigh->nud_state & NUD_VALID) { - set_timeout(jiffies); - } + if (neigh->nud_state & NUD_VALID) + set_timeout(&work, jiffies); } return 0; } @@ -798,7 +852,7 @@ static struct notifier_block nb = { int addr_init(void) { - addr_wq = alloc_workqueue("ib_addr", WQ_MEM_RECLAIM, 0); + addr_wq = alloc_ordered_workqueue("ib_addr", WQ_MEM_RECLAIM); if (!addr_wq) return -ENOMEM; diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index efc94304dee3..77515638c55c 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -1199,30 +1199,23 @@ int ib_cache_setup_one(struct ib_device *device) device->cache.ports = kzalloc(sizeof(*device->cache.ports) * (rdma_end_port(device) - rdma_start_port(device) + 1), GFP_KERNEL); - if (!device->cache.ports) { - err = -ENOMEM; - goto out; - } + if (!device->cache.ports) + return -ENOMEM; err = gid_table_setup_one(device); - if (err) - goto out; + if (err) { + kfree(device->cache.ports); + device->cache.ports = NULL; + return err; + } for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) ib_cache_update(device, p + rdma_start_port(device), true); INIT_IB_EVENT_HANDLER(&device->cache.event_handler, device, ib_cache_event); - err = ib_register_event_handler(&device->cache.event_handler); - if (err) - goto err; - + ib_register_event_handler(&device->cache.event_handler); return 0; - -err: - gid_table_cleanup_one(device); -out: - return err; } void ib_cache_release_one(struct ib_device *device) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 2b4d613a3474..4c4b46586af2 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -373,11 +373,19 @@ out: return ret; } -static int cm_alloc_response_msg(struct cm_port *port, - struct ib_mad_recv_wc *mad_recv_wc, - struct ib_mad_send_buf **msg) +static struct ib_mad_send_buf *cm_alloc_response_msg_no_ah(struct cm_port *port, + struct ib_mad_recv_wc *mad_recv_wc) +{ + return ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index, + 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, + GFP_ATOMIC, + IB_MGMT_BASE_VERSION); +} + +static int cm_create_response_msg_ah(struct cm_port *port, + struct ib_mad_recv_wc *mad_recv_wc, + struct ib_mad_send_buf *msg) { - struct ib_mad_send_buf *m; struct ib_ah *ah; ah = ib_create_ah_from_wc(port->mad_agent->qp->pd, mad_recv_wc->wc, @@ -385,27 +393,40 @@ static int cm_alloc_response_msg(struct cm_port *port, if (IS_ERR(ah)) return PTR_ERR(ah); - m = ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index, - 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, - GFP_ATOMIC, - IB_MGMT_BASE_VERSION); - if 
(IS_ERR(m)) { - rdma_destroy_ah(ah); - return PTR_ERR(m); - } - m->ah = ah; - *msg = m; + msg->ah = ah; return 0; } static void cm_free_msg(struct ib_mad_send_buf *msg) { - rdma_destroy_ah(msg->ah); + if (msg->ah) + rdma_destroy_ah(msg->ah); if (msg->context[0]) cm_deref_id(msg->context[0]); ib_free_send_mad(msg); } +static int cm_alloc_response_msg(struct cm_port *port, + struct ib_mad_recv_wc *mad_recv_wc, + struct ib_mad_send_buf **msg) +{ + struct ib_mad_send_buf *m; + int ret; + + m = cm_alloc_response_msg_no_ah(port, mad_recv_wc); + if (IS_ERR(m)) + return PTR_ERR(m); + + ret = cm_create_response_msg_ah(port, mad_recv_wc, m); + if (ret) { + cm_free_msg(m); + return ret; + } + + *msg = m; + return 0; +} + static void * cm_copy_private_data(const void *private_data, u8 private_data_len) { @@ -1175,6 +1196,11 @@ static void cm_format_req(struct cm_req_msg *req_msg, { struct sa_path_rec *pri_path = param->primary_path; struct sa_path_rec *alt_path = param->alternate_path; + bool pri_ext = false; + + if (pri_path->rec_type == SA_PATH_REC_TYPE_OPA) + pri_ext = opa_is_extended_lid(pri_path->opa.dlid, + pri_path->opa.slid); cm_format_mad_hdr(&req_msg->hdr, CM_REQ_ATTR_ID, cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_REQ)); @@ -1202,18 +1228,24 @@ static void cm_format_req(struct cm_req_msg *req_msg, cm_req_set_srq(req_msg, param->srq); } + req_msg->primary_local_gid = pri_path->sgid; + req_msg->primary_remote_gid = pri_path->dgid; + if (pri_ext) { + req_msg->primary_local_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(pri_path->opa.slid)); + req_msg->primary_remote_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(pri_path->opa.dlid)); + } if (pri_path->hop_limit <= 1) { - req_msg->primary_local_lid = + req_msg->primary_local_lid = pri_ext ? 0 : htons(ntohl(sa_path_get_slid(pri_path))); - req_msg->primary_remote_lid = + req_msg->primary_remote_lid = pri_ext ? 0 : htons(ntohl(sa_path_get_dlid(pri_path))); } else { /* Work-around until there's a way to obtain remote LID info */ req_msg->primary_local_lid = IB_LID_PERMISSIVE; req_msg->primary_remote_lid = IB_LID_PERMISSIVE; } - req_msg->primary_local_gid = pri_path->sgid; - req_msg->primary_remote_gid = pri_path->dgid; cm_req_set_primary_flow_label(req_msg, pri_path->flow_label); cm_req_set_primary_packet_rate(req_msg, pri_path->rate); req_msg->primary_traffic_class = pri_path->traffic_class; @@ -1225,17 +1257,29 @@ static void cm_format_req(struct cm_req_msg *req_msg, pri_path->packet_life_time)); if (alt_path) { + bool alt_ext = false; + + if (alt_path->rec_type == SA_PATH_REC_TYPE_OPA) + alt_ext = opa_is_extended_lid(alt_path->opa.dlid, + alt_path->opa.slid); + + req_msg->alt_local_gid = alt_path->sgid; + req_msg->alt_remote_gid = alt_path->dgid; + if (alt_ext) { + req_msg->alt_local_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(alt_path->opa.slid)); + req_msg->alt_remote_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(alt_path->opa.dlid)); + } if (alt_path->hop_limit <= 1) { - req_msg->alt_local_lid = + req_msg->alt_local_lid = alt_ext ? 0 : htons(ntohl(sa_path_get_slid(alt_path))); - req_msg->alt_remote_lid = + req_msg->alt_remote_lid = alt_ext ? 
0 : htons(ntohl(sa_path_get_dlid(alt_path))); } else { req_msg->alt_local_lid = IB_LID_PERMISSIVE; req_msg->alt_remote_lid = IB_LID_PERMISSIVE; } - req_msg->alt_local_gid = alt_path->sgid; - req_msg->alt_remote_gid = alt_path->dgid; cm_req_set_alt_flow_label(req_msg, alt_path->flow_label); cm_req_set_alt_packet_rate(req_msg, alt_path->rate); @@ -1405,16 +1449,63 @@ static inline int cm_is_active_peer(__be64 local_ca_guid, __be64 remote_ca_guid, (be32_to_cpu(local_qpn) > be32_to_cpu(remote_qpn)))); } +static bool cm_req_has_alt_path(struct cm_req_msg *req_msg) +{ + return ((req_msg->alt_local_lid) || + (ib_is_opa_gid(&req_msg->alt_local_gid))); +} + +static void cm_path_set_rec_type(struct ib_device *ib_device, u8 port_num, + struct sa_path_rec *path, union ib_gid *gid) +{ + if (ib_is_opa_gid(gid) && rdma_cap_opa_ah(ib_device, port_num)) + path->rec_type = SA_PATH_REC_TYPE_OPA; + else + path->rec_type = SA_PATH_REC_TYPE_IB; +} + +static void cm_format_path_lid_from_req(struct cm_req_msg *req_msg, + struct sa_path_rec *primary_path, + struct sa_path_rec *alt_path) +{ + u32 lid; + + if (primary_path->rec_type != SA_PATH_REC_TYPE_OPA) { + sa_path_set_dlid(primary_path, + htonl(ntohs(req_msg->primary_local_lid))); + sa_path_set_slid(primary_path, + htonl(ntohs(req_msg->primary_remote_lid))); + } else { + lid = opa_get_lid_from_gid(&req_msg->primary_local_gid); + sa_path_set_dlid(primary_path, cpu_to_be32(lid)); + + lid = opa_get_lid_from_gid(&req_msg->primary_remote_gid); + sa_path_set_slid(primary_path, cpu_to_be32(lid)); + } + + if (!cm_req_has_alt_path(req_msg)) + return; + + if (alt_path->rec_type != SA_PATH_REC_TYPE_OPA) { + sa_path_set_dlid(alt_path, + htonl(ntohs(req_msg->alt_local_lid))); + sa_path_set_slid(alt_path, + htonl(ntohs(req_msg->alt_remote_lid))); + } else { + lid = opa_get_lid_from_gid(&req_msg->alt_local_gid); + sa_path_set_dlid(alt_path, cpu_to_be32(lid)); + + lid = opa_get_lid_from_gid(&req_msg->alt_remote_gid); + sa_path_set_slid(alt_path, cpu_to_be32(lid)); + } +} + static void cm_format_paths_from_req(struct cm_req_msg *req_msg, struct sa_path_rec *primary_path, struct sa_path_rec *alt_path) { primary_path->dgid = req_msg->primary_local_gid; primary_path->sgid = req_msg->primary_remote_gid; - sa_path_set_dlid(primary_path, - htonl(ntohs(req_msg->primary_local_lid))); - sa_path_set_slid(primary_path, - htonl(ntohs(req_msg->primary_remote_lid))); primary_path->flow_label = cm_req_get_primary_flow_label(req_msg); primary_path->hop_limit = req_msg->primary_hop_limit; primary_path->traffic_class = req_msg->primary_traffic_class; @@ -1431,13 +1522,9 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg, primary_path->packet_life_time -= (primary_path->packet_life_time > 0); primary_path->service_id = req_msg->service_id; - if (req_msg->alt_local_lid) { + if (cm_req_has_alt_path(req_msg)) { alt_path->dgid = req_msg->alt_local_gid; alt_path->sgid = req_msg->alt_remote_gid; - sa_path_set_dlid(alt_path, - htonl(ntohs(req_msg->alt_local_lid))); - sa_path_set_slid(alt_path, - htonl(ntohs(req_msg->alt_remote_lid))); alt_path->flow_label = cm_req_get_alt_flow_label(req_msg); alt_path->hop_limit = req_msg->alt_hop_limit; alt_path->traffic_class = req_msg->alt_traffic_class; @@ -1454,6 +1541,7 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg, alt_path->packet_life_time -= (alt_path->packet_life_time > 0); alt_path->service_id = req_msg->service_id; } + cm_format_path_lid_from_req(req_msg, primary_path, alt_path); } static u16 cm_get_bth_pkey(struct 
cm_work *work) @@ -1703,7 +1791,7 @@ static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc) { if (!cm_req_get_primary_subnet_local(req_msg)) { if (req_msg->primary_local_lid == IB_LID_PERMISSIVE) { - req_msg->primary_local_lid = cpu_to_be16(wc->slid); + req_msg->primary_local_lid = ib_lid_be16(wc->slid); cm_req_set_primary_sl(req_msg, wc->sl); } @@ -1713,7 +1801,7 @@ static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc) if (!cm_req_get_alt_subnet_local(req_msg)) { if (req_msg->alt_local_lid == IB_LID_PERMISSIVE) { - req_msg->alt_local_lid = cpu_to_be16(wc->slid); + req_msg->alt_local_lid = ib_lid_be16(wc->slid); cm_req_set_alt_sl(req_msg, wc->sl); } @@ -1784,9 +1872,12 @@ static int cm_req_handler(struct cm_work *work) dev_net(gid_attr.ndev)); dev_put(gid_attr.ndev); } else { - work->path[0].rec_type = SA_PATH_REC_TYPE_IB; + cm_path_set_rec_type(work->port->cm_dev->ib_device, + work->port->port_num, + &work->path[0], + &req_msg->primary_local_gid); } - if (req_msg->alt_local_lid) + if (cm_req_has_alt_path(req_msg)) work->path[1].rec_type = work->path[0].rec_type; cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]); @@ -1811,16 +1902,19 @@ static int cm_req_handler(struct cm_work *work) dev_net(gid_attr.ndev)); dev_put(gid_attr.ndev); } else { - work->path[0].rec_type = SA_PATH_REC_TYPE_IB; + cm_path_set_rec_type(work->port->cm_dev->ib_device, + work->port->port_num, + &work->path[0], + &req_msg->primary_local_gid); } - if (req_msg->alt_local_lid) + if (cm_req_has_alt_path(req_msg)) work->path[1].rec_type = work->path[0].rec_type; ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID, &work->path[0].sgid, sizeof work->path[0].sgid, NULL, 0); goto rejected; } - if (req_msg->alt_local_lid) { + if (cm_req_has_alt_path(req_msg)) { ret = cm_init_av_by_path(&work->path[1], &cm_id_priv->alt_av, cm_id_priv); if (ret) { @@ -2424,7 +2518,8 @@ static int cm_dreq_handler(struct cm_work *work) case IB_CM_TIMEWAIT: atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. 
counter[CM_DREQ_COUNTER]); - if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg)) + msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc); + if (IS_ERR(msg)) goto unlock; cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv, @@ -2432,7 +2527,8 @@ static int cm_dreq_handler(struct cm_work *work) cm_id_priv->private_data_len); spin_unlock_irq(&cm_id_priv->lock); - if (ib_post_send_mad(msg, NULL)) + if (cm_create_response_msg_ah(work->port, work->mad_recv_wc, msg) || + ib_post_send_mad(msg, NULL)) cm_free_msg(msg); goto deref; case IB_CM_DREQ_RCVD: @@ -2843,6 +2939,11 @@ static void cm_format_lap(struct cm_lap_msg *lap_msg, const void *private_data, u8 private_data_len) { + bool alt_ext = false; + + if (alternate_path->rec_type == SA_PATH_REC_TYPE_OPA) + alt_ext = opa_is_extended_lid(alternate_path->opa.dlid, + alternate_path->opa.slid); cm_format_mad_hdr(&lap_msg->hdr, CM_LAP_ATTR_ID, cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_LAP)); lap_msg->local_comm_id = cm_id_priv->id.local_id; @@ -2856,6 +2957,12 @@ static void cm_format_lap(struct cm_lap_msg *lap_msg, htons(ntohl(sa_path_get_dlid(alternate_path))); lap_msg->alt_local_gid = alternate_path->sgid; lap_msg->alt_remote_gid = alternate_path->dgid; + if (alt_ext) { + lap_msg->alt_local_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(alternate_path->opa.slid)); + lap_msg->alt_remote_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(alternate_path->opa.dlid)); + } cm_lap_set_flow_label(lap_msg, alternate_path->flow_label); cm_lap_set_traffic_class(lap_msg, alternate_path->traffic_class); lap_msg->alt_hop_limit = alternate_path->hop_limit; @@ -2924,16 +3031,29 @@ out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); } EXPORT_SYMBOL(ib_send_cm_lap); +static void cm_format_path_lid_from_lap(struct cm_lap_msg *lap_msg, + struct sa_path_rec *path) +{ + u32 lid; + + if (path->rec_type != SA_PATH_REC_TYPE_OPA) { + sa_path_set_dlid(path, htonl(ntohs(lap_msg->alt_local_lid))); + sa_path_set_slid(path, htonl(ntohs(lap_msg->alt_remote_lid))); + } else { + lid = opa_get_lid_from_gid(&lap_msg->alt_local_gid); + sa_path_set_dlid(path, cpu_to_be32(lid)); + + lid = opa_get_lid_from_gid(&lap_msg->alt_remote_gid); + sa_path_set_slid(path, cpu_to_be32(lid)); + } +} + static void cm_format_path_from_lap(struct cm_id_private *cm_id_priv, struct sa_path_rec *path, struct cm_lap_msg *lap_msg) { - memset(path, 0, sizeof *path); - path->rec_type = SA_PATH_REC_TYPE_IB; path->dgid = lap_msg->alt_local_gid; path->sgid = lap_msg->alt_remote_gid; - sa_path_set_dlid(path, htonl(ntohs(lap_msg->alt_local_lid))); - sa_path_set_slid(path, htonl(ntohs(lap_msg->alt_remote_lid))); path->flow_label = cm_lap_get_flow_label(lap_msg); path->hop_limit = lap_msg->alt_hop_limit; path->traffic_class = cm_lap_get_traffic_class(lap_msg); @@ -2947,6 +3067,7 @@ static void cm_format_path_from_lap(struct cm_id_private *cm_id_priv, path->packet_life_time_selector = IB_SA_EQ; path->packet_life_time = cm_lap_get_local_ack_timeout(lap_msg); path->packet_life_time -= (path->packet_life_time > 0); + cm_format_path_lid_from_lap(lap_msg, path); } static int cm_lap_handler(struct cm_work *work) @@ -2965,6 +3086,11 @@ static int cm_lap_handler(struct cm_work *work) return -EINVAL; param = &work->cm_event.param.lap_rcvd; + memset(&work->path[0], 0, sizeof(work->path[1])); + cm_path_set_rec_type(work->port->cm_dev->ib_device, + work->port->port_num, + &work->path[0], + &lap_msg->alt_local_gid); param->alternate_path = &work->path[0]; cm_format_path_from_lap(cm_id_priv, 
param->alternate_path, lap_msg); work->cm_event.private_data = &lap_msg->private_data; @@ -2980,7 +3106,8 @@ static int cm_lap_handler(struct cm_work *work) case IB_CM_MRA_LAP_SENT: atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. counter[CM_LAP_COUNTER]); - if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg)) + msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc); + if (IS_ERR(msg)) goto unlock; cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, @@ -2990,7 +3117,8 @@ static int cm_lap_handler(struct cm_work *work) cm_id_priv->private_data_len); spin_unlock_irq(&cm_id_priv->lock); - if (ib_post_send_mad(msg, NULL)) + if (cm_create_response_msg_ah(work->port, work->mad_recv_wc, msg) || + ib_post_send_mad(msg, NULL)) cm_free_msg(msg); goto deref; case IB_CM_LAP_RCVD: @@ -4201,7 +4329,7 @@ static int __init ib_cm_init(void) goto error1; } - cm.wq = create_workqueue("ib_cm"); + cm.wq = alloc_workqueue("ib_cm", 0, 1); if (!cm.wq) { ret = -ENOMEM; goto error2; diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 31bb82d8ecd7..852c8fec8088 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -72,6 +72,7 @@ MODULE_LICENSE("Dual BSD/GPL"); #define CMA_MAX_CM_RETRIES 15 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) #define CMA_IBOE_PACKET_LIFETIME 18 +#define CMA_PREFERRED_ROCE_GID_TYPE IB_GID_TYPE_ROCE_UDP_ENCAP static const char * const cma_events[] = { [RDMA_CM_EVENT_ADDR_RESOLVED] = "address resolved", @@ -623,22 +624,11 @@ static inline int cma_validate_port(struct ib_device *device, u8 port, if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port)) return ret; - if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) { + if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) ndev = dev_get_by_index(&init_net, bound_if_index); - if (ndev && ndev->flags & IFF_LOOPBACK) { - pr_info("detected loopback device\n"); - dev_put(ndev); - - if (!device->get_netdev) - return -EOPNOTSUPP; - - ndev = device->get_netdev(device, port); - if (!ndev) - return -ENODEV; - } - } else { + else gid_type = IB_GID_TYPE_IB; - } + ret = ib_find_cached_gid_by_port(device, gid, gid_type, port, ndev, NULL); @@ -1044,6 +1034,8 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, } else ret = iw_cm_init_qp_attr(id_priv->cm_id.iw, qp_attr, qp_attr_mask); + qp_attr->port_num = id_priv->id.port_num; + *qp_attr_mask |= IB_QP_PORT; } else ret = -ENOSYS; @@ -2569,21 +2561,6 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) goto err2; } - if (ndev->flags & IFF_LOOPBACK) { - dev_put(ndev); - if (!id_priv->id.device->get_netdev) { - ret = -EOPNOTSUPP; - goto err2; - } - - ndev = id_priv->id.device->get_netdev(id_priv->id.device, - id_priv->id.port_num); - if (!ndev) { - ret = -ENODEV; - goto err2; - } - } - supported_gids = roce_gid_type_mask_support(id_priv->id.device, id_priv->id.port_num); gid_type = cma_route_gid_type(addr->dev_addr.network, @@ -4022,7 +3999,8 @@ static void iboe_mcast_work_handler(struct work_struct *work) kfree(mw); } -static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid) +static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, + enum ib_gid_type gid_type) { struct sockaddr_in *sin = (struct sockaddr_in *)addr; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; @@ -4032,8 +4010,8 @@ static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid) } else if (addr->sa_family == 
AF_INET6) { memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); } else { - mgid->raw[0] = 0xff; - mgid->raw[1] = 0x0e; + mgid->raw[0] = (gid_type == IB_GID_TYPE_IB) ? 0xff : 0; + mgid->raw[1] = (gid_type == IB_GID_TYPE_IB) ? 0x0e : 0; mgid->raw[2] = 0; mgid->raw[3] = 0; mgid->raw[4] = 0; @@ -4074,7 +4052,9 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, goto out1; } - cma_iboe_set_mgid(addr, &mc->multicast.ib->rec.mgid); + gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - + rdma_start_port(id_priv->cma_dev->device)]; + cma_iboe_set_mgid(addr, &mc->multicast.ib->rec.mgid, gid_type); mc->multicast.ib->rec.pkey = cpu_to_be16(0xffff); if (id_priv->id.ps == RDMA_PS_UDP) @@ -4090,8 +4070,6 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, mc->multicast.ib->rec.hop_limit = 1; mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->mtu); - gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - - rdma_start_port(id_priv->cma_dev->device)]; if (addr->sa_family == AF_INET) { if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { mc->multicast.ib->rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; @@ -4304,8 +4282,12 @@ static void cma_add_one(struct ib_device *device) for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { supported_gids = roce_gid_type_mask_support(device, i); WARN_ON(!supported_gids); - cma_dev->default_gid_type[i - rdma_start_port(device)] = - find_first_bit(&supported_gids, BITS_PER_LONG); + if (supported_gids & (1 << CMA_PREFERRED_ROCE_GID_TYPE)) + cma_dev->default_gid_type[i - rdma_start_port(device)] = + CMA_PREFERRED_ROCE_GID_TYPE; + else + cma_dev->default_gid_type[i - rdma_start_port(device)] = + find_first_bit(&supported_gids, BITS_PER_LONG); cma_dev->default_roce_tos[i - rdma_start_port(device)] = 0; } @@ -4476,9 +4458,8 @@ out: return skb->len; } -static const struct ibnl_client_cbs cma_cb_table[] = { - [RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats, - .module = THIS_MODULE }, +static const struct rdma_nl_cbs cma_cb_table[] = { + [RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats}, }; static int cma_init_net(struct net *net) @@ -4530,9 +4511,7 @@ static int __init cma_init(void) if (ret) goto err; - if (ibnl_add_client(RDMA_NL_RDMA_CM, ARRAY_SIZE(cma_cb_table), - cma_cb_table)) - pr_warn("RDMA CMA: failed to add netlink callback\n"); + rdma_nl_register(RDMA_NL_RDMA_CM, cma_cb_table); cma_configfs_init(); return 0; @@ -4549,7 +4528,7 @@ err_wq: static void __exit cma_cleanup(void) { cma_configfs_exit(); - ibnl_remove_client(RDMA_NL_RDMA_CM); + rdma_nl_unregister(RDMA_NL_RDMA_CM); ib_unregister_client(&cma_client); unregister_netdevice_notifier(&cma_nb); rdma_addr_unregister_client(&addr_client); @@ -4558,5 +4537,7 @@ static void __exit cma_cleanup(void) destroy_workqueue(cma_wq); } +MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_RDMA_CM, 1); + module_init(cma_init); module_exit(cma_cleanup); diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 11ae67514e13..a1d687a664f8 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -38,6 +38,7 @@ #include <linux/cgroup_rdma.h> #include <rdma/ib_verbs.h> +#include <rdma/opa_addr.h> #include <rdma/ib_mad.h> #include "mad_priv.h" @@ -102,6 +103,14 @@ void ib_enum_all_roce_netdevs(roce_netdev_filter filter, roce_netdev_callback cb, void *cookie); +typedef int (*nldev_callback)(struct ib_device *device, + struct sk_buff *skb, + struct netlink_callback *cb, + unsigned int idx); + +int 
ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, + struct netlink_callback *cb); + enum ib_cache_gid_default_mode { IB_CACHE_GID_DEFAULT_MODE_SET, IB_CACHE_GID_DEFAULT_MODE_DELETE @@ -179,8 +188,8 @@ void ib_mad_cleanup(void); int ib_sa_init(void); void ib_sa_cleanup(void); -int ibnl_init(void); -void ibnl_cleanup(void); +int rdma_nl_init(void); +void rdma_nl_exit(void); /** * Check if there are any listeners to the netlink group @@ -190,11 +199,14 @@ void ibnl_cleanup(void); int ibnl_chk_listeners(unsigned int group); int ib_nl_handle_resolve_resp(struct sk_buff *skb, - struct netlink_callback *cb); + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack); int ib_nl_handle_set_timeout(struct sk_buff *skb, - struct netlink_callback *cb); + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack); int ib_nl_handle_ip_res_resp(struct sk_buff *skb, - struct netlink_callback *cb); + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack); int ib_get_cached_subnet_prefix(struct ib_device *device, u8 port_num, @@ -301,4 +313,9 @@ static inline int ib_mad_enforce_security(struct ib_mad_agent_private *map, return 0; } #endif + +struct ib_device *__ib_device_get_by_index(u32 ifindex); +/* RDMA device netlink */ +void nldev_init(void); +void nldev_exit(void); #endif /* _CORE_PRIV_H */ diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index a5dfab6adf49..84fc32a2c8b3 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -134,6 +134,17 @@ static int ib_device_check_mandatory(struct ib_device *device) return 0; } +struct ib_device *__ib_device_get_by_index(u32 index) +{ + struct ib_device *device; + + list_for_each_entry(device, &device_list, core_list) + if (device->index == index) + return device; + + return NULL; +} + static struct ib_device *__ib_device_get_by_name(const char *name) { struct ib_device *device; @@ -145,7 +156,6 @@ static struct ib_device *__ib_device_get_by_name(const char *name) return NULL; } - static int alloc_name(char *name) { unsigned long *inuse; @@ -326,10 +336,10 @@ static int read_port_immutable(struct ib_device *device) return 0; } -void ib_get_device_fw_str(struct ib_device *dev, char *str, size_t str_len) +void ib_get_device_fw_str(struct ib_device *dev, char *str) { if (dev->get_dev_fw_str) - dev->get_dev_fw_str(dev, str, str_len); + dev->get_dev_fw_str(dev, str); else str[0] = '\0'; } @@ -395,6 +405,30 @@ static int ib_security_change(struct notifier_block *nb, unsigned long event, } /** + * __dev_new_index - allocate an device index + * + * Returns a suitable unique value for a new device interface + * number. It assumes that there are less than 2^32-1 ib devices + * will be present in the system. + */ +static u32 __dev_new_index(void) +{ + /* + * The device index to allow stable naming. + * Similar to struct net -> ifindex. 
+ */ + static u32 index; + + for (;;) { + if (!(++index)) + index = 1; + + if (!__ib_device_get_by_index(index)) + return index; + } +} + +/** * ib_register_device - Register an IB device with IB core * @device:Device to register * @@ -489,9 +523,10 @@ int ib_register_device(struct ib_device *device, device->reg_state = IB_DEV_REGISTERED; list_for_each_entry(client, &client_list, list) - if (client->add && !add_client_context(device, client)) + if (!add_client_context(device, client) && client->add) client->add(device); + device->index = __dev_new_index(); down_write(&lists_rwsem); list_add_tail(&device->core_list, &device_list); up_write(&lists_rwsem); @@ -537,10 +572,11 @@ void ib_unregister_device(struct ib_device *device) } up_read(&lists_rwsem); - mutex_unlock(&device_mutex); - ib_device_unregister_rdmacg(device); ib_device_unregister_sysfs(device); + + mutex_unlock(&device_mutex); + ib_cache_cleanup_one(device); ib_security_destroy_port_pkey_list(device); @@ -577,7 +613,7 @@ int ib_register_client(struct ib_client *client) mutex_lock(&device_mutex); list_for_each_entry(device, &device_list, core_list) - if (client->add && !add_client_context(device, client)) + if (!add_client_context(device, client) && client->add) client->add(device); down_write(&lists_rwsem); @@ -711,7 +747,7 @@ EXPORT_SYMBOL(ib_set_client_data); * chapter 11 of the InfiniBand Architecture Specification). This * callback may occur in interrupt context. */ -int ib_register_event_handler (struct ib_event_handler *event_handler) +void ib_register_event_handler(struct ib_event_handler *event_handler) { unsigned long flags; @@ -719,8 +755,6 @@ int ib_register_event_handler (struct ib_event_handler *event_handler) list_add_tail(&event_handler->list, &event_handler->device->event_handler_list); spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); - - return 0; } EXPORT_SYMBOL(ib_register_event_handler); @@ -731,15 +765,13 @@ EXPORT_SYMBOL(ib_register_event_handler); * Unregister an event handler registered with * ib_register_event_handler(). */ -int ib_unregister_event_handler(struct ib_event_handler *event_handler) +void ib_unregister_event_handler(struct ib_event_handler *event_handler) { unsigned long flags; spin_lock_irqsave(&event_handler->device->event_handler_lock, flags); list_del(&event_handler->list); spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); - - return 0; } EXPORT_SYMBOL(ib_unregister_event_handler); @@ -893,6 +925,31 @@ void ib_enum_all_roce_netdevs(roce_netdev_filter filter, } /** + * ib_enum_all_devs - enumerate all ib_devices + * @cb: Callback to call for each found ib_device + * + * Enumerates all ib_devices and calls callback() on each device. 
+ */ +int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct ib_device *dev; + unsigned int idx = 0; + int ret = 0; + + down_read(&lists_rwsem); + list_for_each_entry(dev, &device_list, core_list) { + ret = nldev_cb(dev, skb, cb, idx); + if (ret) + break; + idx++; + } + + up_read(&lists_rwsem); + return ret; +} + +/** * ib_query_pkey - Get P_Key table entry * @device:Device to query * @port_num:Port number to query @@ -944,14 +1001,17 @@ int ib_modify_port(struct ib_device *device, u8 port_num, int port_modify_mask, struct ib_port_modify *port_modify) { - if (!device->modify_port) - return -ENOSYS; + int rc; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; - return device->modify_port(device, port_num, port_modify_mask, - port_modify); + if (device->modify_port) + rc = device->modify_port(device, port_num, port_modify_mask, + port_modify); + else + rc = rdma_protocol_roce(device, port_num) ? 0 : -ENOSYS; + return rc; } EXPORT_SYMBOL(ib_modify_port); @@ -1086,29 +1146,21 @@ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, } EXPORT_SYMBOL(ib_get_net_dev_by_params); -static struct ibnl_client_cbs ibnl_ls_cb_table[] = { +static const struct rdma_nl_cbs ibnl_ls_cb_table[] = { [RDMA_NL_LS_OP_RESOLVE] = { - .dump = ib_nl_handle_resolve_resp, - .module = THIS_MODULE }, + .doit = ib_nl_handle_resolve_resp, + .flags = RDMA_NL_ADMIN_PERM, + }, [RDMA_NL_LS_OP_SET_TIMEOUT] = { - .dump = ib_nl_handle_set_timeout, - .module = THIS_MODULE }, + .doit = ib_nl_handle_set_timeout, + .flags = RDMA_NL_ADMIN_PERM, + }, [RDMA_NL_LS_OP_IP_RESOLVE] = { - .dump = ib_nl_handle_ip_res_resp, - .module = THIS_MODULE }, + .doit = ib_nl_handle_ip_res_resp, + .flags = RDMA_NL_ADMIN_PERM, + }, }; -static int ib_add_ibnl_clients(void) -{ - return ibnl_add_client(RDMA_NL_LS, ARRAY_SIZE(ibnl_ls_cb_table), - ibnl_ls_cb_table); -} - -static void ib_remove_ibnl_clients(void) -{ - ibnl_remove_client(RDMA_NL_LS); -} - static int __init ib_core_init(void) { int ret; @@ -1130,9 +1182,9 @@ static int __init ib_core_init(void) goto err_comp; } - ret = ibnl_init(); + ret = rdma_nl_init(); if (ret) { - pr_warn("Couldn't init IB netlink interface\n"); + pr_warn("Couldn't init IB netlink interface: err %d\n", ret); goto err_sysfs; } @@ -1154,24 +1206,18 @@ static int __init ib_core_init(void) goto err_mad; } - ret = ib_add_ibnl_clients(); - if (ret) { - pr_warn("Couldn't register ibnl clients\n"); - goto err_sa; - } - ret = register_lsm_notifier(&ibdev_lsm_nb); if (ret) { pr_warn("Couldn't register LSM notifier. ret %d\n", ret); - goto err_ibnl_clients; + goto err_sa; } + nldev_init(); + rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table); ib_cache_setup(); return 0; -err_ibnl_clients: - ib_remove_ibnl_clients(); err_sa: ib_sa_cleanup(); err_mad: @@ -1179,7 +1225,7 @@ err_mad: err_addr: addr_cleanup(); err_ibnl: - ibnl_cleanup(); + rdma_nl_exit(); err_sysfs: class_unregister(&ib_class); err_comp: @@ -1191,18 +1237,21 @@ err: static void __exit ib_core_cleanup(void) { - unregister_lsm_notifier(&ibdev_lsm_nb); ib_cache_cleanup(); - ib_remove_ibnl_clients(); + nldev_exit(); + rdma_nl_unregister(RDMA_NL_LS); + unregister_lsm_notifier(&ibdev_lsm_nb); ib_sa_cleanup(); ib_mad_cleanup(); addr_cleanup(); - ibnl_cleanup(); + rdma_nl_exit(); class_unregister(&ib_class); destroy_workqueue(ib_comp_wq); /* Make sure that any pending umem accounting work is done. 
*/ destroy_workqueue(ib_wq); } +MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4); + module_init(ib_core_init); module_exit(ib_core_cleanup); diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 31661b5c1743..fcf42f6bb82a 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -80,7 +80,7 @@ const char *__attribute_const__ iwcm_reject_msg(int reason) } EXPORT_SYMBOL(iwcm_reject_msg); -static struct ibnl_client_cbs iwcm_nl_cb_table[] = { +static struct rdma_nl_cbs iwcm_nl_cb_table[] = { [RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb}, [RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb}, [RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = iwpm_add_and_query_mapping_cb}, @@ -1175,13 +1175,9 @@ static int __init iw_cm_init(void) ret = iwpm_init(RDMA_NL_IWCM); if (ret) pr_err("iw_cm: couldn't init iwpm\n"); - - ret = ibnl_add_client(RDMA_NL_IWCM, ARRAY_SIZE(iwcm_nl_cb_table), - iwcm_nl_cb_table); - if (ret) - pr_err("iw_cm: couldn't register netlink callbacks\n"); - - iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", WQ_MEM_RECLAIM); + else + rdma_nl_register(RDMA_NL_IWCM, iwcm_nl_cb_table); + iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", 0); if (!iwcm_wq) return -ENOMEM; @@ -1200,9 +1196,11 @@ static void __exit iw_cm_cleanup(void) { unregister_net_sysctl_table(iwcm_ctl_table_hdr); destroy_workqueue(iwcm_wq); - ibnl_remove_client(RDMA_NL_IWCM); + rdma_nl_unregister(RDMA_NL_IWCM); iwpm_exit(RDMA_NL_IWCM); } +MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_IWCM, 2); + module_init(iw_cm_init); module_exit(iw_cm_cleanup); diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c index a0e7c16d8bd8..8861c052155a 100644 --- a/drivers/infiniband/core/iwpm_msg.c +++ b/drivers/infiniband/core/iwpm_msg.c @@ -42,7 +42,6 @@ int iwpm_valid_pid(void) { return iwpm_user_pid > 0; } -EXPORT_SYMBOL(iwpm_valid_pid); /* * iwpm_register_pid - Send a netlink query to user space @@ -101,10 +100,12 @@ int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client) if (ret) goto pid_query_error; + nlmsg_end(skb, nlh); + pr_debug("%s: Multicasting a nlmsg (dev = %s ifname = %s iwpm = %s)\n", __func__, pm_msg->dev_name, pm_msg->if_name, iwpm_ulib_name); - ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_IWPM, GFP_KERNEL); + ret = rdma_nl_multicast(skb, RDMA_NL_GROUP_IWPM, GFP_KERNEL); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ iwpm_user_pid = IWPM_PID_UNAVAILABLE; @@ -122,7 +123,6 @@ pid_query_error: iwpm_free_nlmsg_request(&nlmsg_request->kref); return ret; } -EXPORT_SYMBOL(iwpm_register_pid); /* * iwpm_add_mapping - Send a netlink add mapping message @@ -172,9 +172,11 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) &pm_msg->loc_addr, IWPM_NLA_MANAGE_ADDR); if (ret) goto add_mapping_error; + + nlmsg_end(skb, nlh); nlmsg_request->req_buffer = pm_msg; - ret = ibnl_unicast(skb, nlh, iwpm_user_pid); + ret = rdma_nl_unicast_wait(skb, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ iwpm_user_pid = IWPM_PID_UNDEFINED; @@ -191,7 +193,6 @@ add_mapping_error: iwpm_free_nlmsg_request(&nlmsg_request->kref); return ret; } -EXPORT_SYMBOL(iwpm_add_mapping); /* * iwpm_add_and_query_mapping - Send a netlink add and query @@ -249,9 +250,11 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) &pm_msg->rem_addr, IWPM_NLA_QUERY_REMOTE_ADDR); if (ret) goto query_mapping_error; + + nlmsg_end(skb, nlh); nlmsg_request->req_buffer = pm_msg; - ret = ibnl_unicast(skb, nlh, 
iwpm_user_pid); + ret = rdma_nl_unicast_wait(skb, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ err_str = "Unable to send a nlmsg"; @@ -267,7 +270,6 @@ query_mapping_error: iwpm_free_nlmsg_request(&nlmsg_request->kref); return ret; } -EXPORT_SYMBOL(iwpm_add_and_query_mapping); /* * iwpm_remove_mapping - Send a netlink remove mapping message @@ -312,7 +314,9 @@ int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client) if (ret) goto remove_mapping_error; - ret = ibnl_unicast(skb, nlh, iwpm_user_pid); + nlmsg_end(skb, nlh); + + ret = rdma_nl_unicast_wait(skb, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ iwpm_user_pid = IWPM_PID_UNDEFINED; @@ -328,7 +332,6 @@ remove_mapping_error: dev_kfree_skb_any(skb); return ret; } -EXPORT_SYMBOL(iwpm_remove_mapping); /* netlink attribute policy for the received response to register pid request */ static const struct nla_policy resp_reg_policy[IWPM_NLA_RREG_PID_MAX] = { @@ -397,7 +400,6 @@ register_pid_response_exit: up(&nlmsg_request->sem); return 0; } -EXPORT_SYMBOL(iwpm_register_pid_cb); /* netlink attribute policy for the received response to add mapping request */ static const struct nla_policy resp_add_policy[IWPM_NLA_RMANAGE_MAPPING_MAX] = { @@ -466,7 +468,6 @@ add_mapping_response_exit: up(&nlmsg_request->sem); return 0; } -EXPORT_SYMBOL(iwpm_add_mapping_cb); /* netlink attribute policy for the response to add and query mapping request * and response with remote address info */ @@ -558,7 +559,6 @@ query_mapping_response_exit: up(&nlmsg_request->sem); return 0; } -EXPORT_SYMBOL(iwpm_add_and_query_mapping_cb); /* * iwpm_remote_info_cb - Process a port mapper message, containing @@ -627,7 +627,6 @@ int iwpm_remote_info_cb(struct sk_buff *skb, struct netlink_callback *cb) "remote_info: Mapped remote sockaddr:"); return ret; } -EXPORT_SYMBOL(iwpm_remote_info_cb); /* netlink attribute policy for the received request for mapping info */ static const struct nla_policy resp_mapinfo_policy[IWPM_NLA_MAPINFO_REQ_MAX] = { @@ -677,7 +676,6 @@ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) ret = iwpm_send_mapinfo(nl_client, iwpm_user_pid); return ret; } -EXPORT_SYMBOL(iwpm_mapping_info_cb); /* netlink attribute policy for the received mapping info ack */ static const struct nla_policy ack_mapinfo_policy[IWPM_NLA_MAPINFO_NUM_MAX] = { @@ -707,7 +705,6 @@ int iwpm_ack_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); return 0; } -EXPORT_SYMBOL(iwpm_ack_mapping_info_cb); /* netlink attribute policy for the received port mapper error message */ static const struct nla_policy map_error_policy[IWPM_NLA_ERR_MAX] = { @@ -751,4 +748,3 @@ int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb) up(&nlmsg_request->sem); return 0; } -EXPORT_SYMBOL(iwpm_mapping_error_cb); diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c index f13870e69ccd..3c4faadb8cdd 100644 --- a/drivers/infiniband/core/iwpm_util.c +++ b/drivers/infiniband/core/iwpm_util.c @@ -54,8 +54,6 @@ static struct iwpm_admin_data iwpm_admin; int iwpm_init(u8 nl_client) { int ret = 0; - if (iwpm_valid_client(nl_client)) - return -EINVAL; mutex_lock(&iwpm_admin_lock); if (atomic_read(&iwpm_admin.refcount) == 0) { iwpm_hash_bucket = kzalloc(IWPM_MAPINFO_HASH_SIZE * @@ -83,7 +81,6 @@ init_exit: } return ret; } -EXPORT_SYMBOL(iwpm_init); static void free_hash_bucket(void); 
static void free_reminfo_bucket(void); @@ -109,7 +106,6 @@ int iwpm_exit(u8 nl_client) iwpm_set_registration(nl_client, IWPM_REG_UNDEF); return 0; } -EXPORT_SYMBOL(iwpm_exit); static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage *, struct sockaddr_storage *); @@ -148,7 +144,6 @@ int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr, spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); return ret; } -EXPORT_SYMBOL(iwpm_create_mapinfo); int iwpm_remove_mapinfo(struct sockaddr_storage *local_sockaddr, struct sockaddr_storage *mapped_local_addr) @@ -184,7 +179,6 @@ remove_mapinfo_exit: spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); return ret; } -EXPORT_SYMBOL(iwpm_remove_mapinfo); static void free_hash_bucket(void) { @@ -297,7 +291,6 @@ get_remote_info_exit: spin_unlock_irqrestore(&iwpm_reminfo_lock, flags); return ret; } -EXPORT_SYMBOL(iwpm_get_remote_info); struct iwpm_nlmsg_request *iwpm_get_nlmsg_request(__u32 nlmsg_seq, u8 nl_client, gfp_t gfp) @@ -383,15 +376,11 @@ int iwpm_get_nlmsg_seq(void) int iwpm_valid_client(u8 nl_client) { - if (nl_client >= RDMA_NL_NUM_CLIENTS) - return 0; return iwpm_admin.client_list[nl_client]; } void iwpm_set_valid(u8 nl_client, int valid) { - if (nl_client >= RDMA_NL_NUM_CLIENTS) - return; iwpm_admin.client_list[nl_client] = valid; } @@ -608,7 +597,10 @@ static int send_mapinfo_num(u32 mapping_num, u8 nl_client, int iwpm_pid) &mapping_num, IWPM_NLA_MAPINFO_SEND_NUM); if (ret) goto mapinfo_num_error; - ret = ibnl_unicast(skb, nlh, iwpm_pid); + + nlmsg_end(skb, nlh); + + ret = rdma_nl_unicast(skb, iwpm_pid); if (ret) { skb = NULL; err_str = "Unable to send a nlmsg"; @@ -637,7 +629,7 @@ static int send_nlmsg_done(struct sk_buff *skb, u8 nl_client, int iwpm_pid) return -ENOMEM; } nlh->nlmsg_type = NLMSG_DONE; - ret = ibnl_unicast(skb, (struct nlmsghdr *)skb->data, iwpm_pid); + ret = rdma_nl_unicast(skb, iwpm_pid); if (ret) pr_warn("%s Unable to send a nlmsg\n", __func__); return ret; @@ -689,6 +681,8 @@ int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid) if (ret) goto send_mapping_info_unlock; + nlmsg_end(skb, nlh); + iwpm_print_sockaddr(&map_info->local_sockaddr, "send_mapping_info: Local sockaddr:"); iwpm_print_sockaddr(&map_info->mapped_sockaddr, diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c index 0d3cca0a8890..e5cf09c66fe6 100644 --- a/drivers/infiniband/core/mad_rmpp.c +++ b/drivers/infiniband/core/mad_rmpp.c @@ -64,7 +64,7 @@ struct mad_rmpp_recv { __be64 tid; u32 src_qp; - u16 slid; + u32 slid; u8 mgmt_class; u8 class_version; u8 method; diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 94931c474d41..1fb72c356e36 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2017 Mellanox Technologies Inc. All rights reserved. * Copyright (c) 2010 Voltaire Inc. All rights reserved. 
* * This software is available to you under a choice of one of two @@ -37,239 +38,278 @@ #include <net/net_namespace.h> #include <net/sock.h> #include <rdma/rdma_netlink.h> +#include <linux/module.h> #include "core_priv.h" -struct ibnl_client { - struct list_head list; - int index; - int nops; - const struct ibnl_client_cbs *cb_table; -}; +#include "core_priv.h" -static DEFINE_MUTEX(ibnl_mutex); +static DEFINE_MUTEX(rdma_nl_mutex); static struct sock *nls; -static LIST_HEAD(client_list); +static struct { + const struct rdma_nl_cbs *cb_table; +} rdma_nl_types[RDMA_NL_NUM_CLIENTS]; -int ibnl_chk_listeners(unsigned int group) +int rdma_nl_chk_listeners(unsigned int group) { - if (netlink_has_listeners(nls, group) == 0) - return -1; - return 0; + return (netlink_has_listeners(nls, group)) ? 0 : -1; } +EXPORT_SYMBOL(rdma_nl_chk_listeners); -int ibnl_add_client(int index, int nops, - const struct ibnl_client_cbs cb_table[]) +static bool is_nl_msg_valid(unsigned int type, unsigned int op) { - struct ibnl_client *cur; - struct ibnl_client *nl_client; + static const unsigned int max_num_ops[RDMA_NL_NUM_CLIENTS] = { + [RDMA_NL_RDMA_CM] = RDMA_NL_RDMA_CM_NUM_OPS, + [RDMA_NL_IWCM] = RDMA_NL_IWPM_NUM_OPS, + [RDMA_NL_LS] = RDMA_NL_LS_NUM_OPS, + [RDMA_NL_NLDEV] = RDMA_NLDEV_NUM_OPS, + }; - nl_client = kmalloc(sizeof *nl_client, GFP_KERNEL); - if (!nl_client) - return -ENOMEM; + /* + * This BUILD_BUG_ON is intended to catch addition of new + * RDMA netlink protocol without updating the array above. + */ + BUILD_BUG_ON(RDMA_NL_NUM_CLIENTS != 6); - nl_client->index = index; - nl_client->nops = nops; - nl_client->cb_table = cb_table; + if (type >= RDMA_NL_NUM_CLIENTS) + return false; - mutex_lock(&ibnl_mutex); + return (op < max_num_ops[type]) ? true : false; +} - list_for_each_entry(cur, &client_list, list) { - if (cur->index == index) { - pr_warn("Client for %d already exists\n", index); - mutex_unlock(&ibnl_mutex); - kfree(nl_client); - return -EINVAL; - } +static bool is_nl_valid(unsigned int type, unsigned int op) +{ + const struct rdma_nl_cbs *cb_table; + + if (!is_nl_msg_valid(type, op)) + return false; + + cb_table = rdma_nl_types[type].cb_table; +#ifdef CONFIG_MODULES + if (!cb_table) { + mutex_unlock(&rdma_nl_mutex); + request_module("rdma-netlink-subsys-%d", type); + mutex_lock(&rdma_nl_mutex); + cb_table = rdma_nl_types[type].cb_table; } +#endif - list_add_tail(&nl_client->list, &client_list); - - mutex_unlock(&ibnl_mutex); - - return 0; + if (!cb_table || (!cb_table[op].dump && !cb_table[op].doit)) + return false; + return true; } -EXPORT_SYMBOL(ibnl_add_client); -int ibnl_remove_client(int index) +void rdma_nl_register(unsigned int index, + const struct rdma_nl_cbs cb_table[]) { - struct ibnl_client *cur, *next; - - mutex_lock(&ibnl_mutex); - list_for_each_entry_safe(cur, next, &client_list, list) { - if (cur->index == index) { - list_del(&(cur->list)); - mutex_unlock(&ibnl_mutex); - kfree(cur); - return 0; - } + mutex_lock(&rdma_nl_mutex); + if (!is_nl_msg_valid(index, 0)) { + /* + * All clients are not interesting in success/failure of + * this call. They want to see the print to error log and + * continue their initialization. Print warning for them, + * because it is programmer's error to be here. + */ + mutex_unlock(&rdma_nl_mutex); + WARN(true, + "The not-valid %u index was supplied to RDMA netlink\n", + index); + return; } - pr_warn("Can't remove callback for client idx %d. 
Not found\n", index); - mutex_unlock(&ibnl_mutex); - return -EINVAL; + if (rdma_nl_types[index].cb_table) { + mutex_unlock(&rdma_nl_mutex); + WARN(true, + "The %u index is already registered in RDMA netlink\n", + index); + return; + } + + rdma_nl_types[index].cb_table = cb_table; + mutex_unlock(&rdma_nl_mutex); +} +EXPORT_SYMBOL(rdma_nl_register); + +void rdma_nl_unregister(unsigned int index) +{ + mutex_lock(&rdma_nl_mutex); + rdma_nl_types[index].cb_table = NULL; + mutex_unlock(&rdma_nl_mutex); } -EXPORT_SYMBOL(ibnl_remove_client); +EXPORT_SYMBOL(rdma_nl_unregister); void *ibnl_put_msg(struct sk_buff *skb, struct nlmsghdr **nlh, int seq, int len, int client, int op, int flags) { - unsigned char *prev_tail; - - prev_tail = skb_tail_pointer(skb); - *nlh = nlmsg_put(skb, 0, seq, RDMA_NL_GET_TYPE(client, op), - len, flags); + *nlh = nlmsg_put(skb, 0, seq, RDMA_NL_GET_TYPE(client, op), len, flags); if (!*nlh) - goto out_nlmsg_trim; - (*nlh)->nlmsg_len = skb_tail_pointer(skb) - prev_tail; + return NULL; return nlmsg_data(*nlh); - -out_nlmsg_trim: - nlmsg_trim(skb, prev_tail); - return NULL; } EXPORT_SYMBOL(ibnl_put_msg); int ibnl_put_attr(struct sk_buff *skb, struct nlmsghdr *nlh, int len, void *data, int type) { - unsigned char *prev_tail; - - prev_tail = skb_tail_pointer(skb); - if (nla_put(skb, type, len, data)) - goto nla_put_failure; - nlh->nlmsg_len += skb_tail_pointer(skb) - prev_tail; + if (nla_put(skb, type, len, data)) { + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; + } return 0; - -nla_put_failure: - nlmsg_trim(skb, prev_tail - nlh->nlmsg_len); - return -EMSGSIZE; } EXPORT_SYMBOL(ibnl_put_attr); -static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) +static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { - struct ibnl_client *client; int type = nlh->nlmsg_type; - int index = RDMA_NL_GET_CLIENT(type); + unsigned int index = RDMA_NL_GET_CLIENT(type); unsigned int op = RDMA_NL_GET_OP(type); + const struct rdma_nl_cbs *cb_table; + + if (!is_nl_valid(index, op)) + return -EINVAL; + + cb_table = rdma_nl_types[index].cb_table; + + if ((cb_table[op].flags & RDMA_NL_ADMIN_PERM) && + !netlink_capable(skb, CAP_NET_ADMIN)) + return -EPERM; - list_for_each_entry(client, &client_list, list) { - if (client->index == index) { - if (op >= client->nops || !client->cb_table[op].dump) - return -EINVAL; - - /* - * For response or local service set_timeout request, - * there is no need to use netlink_dump_start. - */ - if (!(nlh->nlmsg_flags & NLM_F_REQUEST) || - (index == RDMA_NL_LS && - op == RDMA_NL_LS_OP_SET_TIMEOUT)) { - struct netlink_callback cb = { - .skb = skb, - .nlh = nlh, - .dump = client->cb_table[op].dump, - .module = client->cb_table[op].module, - }; - - return cb.dump(skb, &cb); - } - - { - struct netlink_dump_control c = { - .dump = client->cb_table[op].dump, - .module = client->cb_table[op].module, - }; - return netlink_dump_start(nls, skb, nlh, &c); - } - } + /* + * LS responses overload the 0x100 (NLM_F_ROOT) flag. Don't + * mistakenly call the .dump() function. 
+ */ + if (index == RDMA_NL_LS) { + if (cb_table[op].doit) + return cb_table[op].doit(skb, nlh, extack); + return -EINVAL; } + /* FIXME: Convert IWCM to properly handle doit callbacks */ + if ((nlh->nlmsg_flags & NLM_F_DUMP) || index == RDMA_NL_RDMA_CM || + index == RDMA_NL_IWCM) { + struct netlink_dump_control c = { + .dump = cb_table[op].dump, + }; + if (c.dump) + return netlink_dump_start(nls, skb, nlh, &c); + return -EINVAL; + } + + if (cb_table[op].doit) + return cb_table[op].doit(skb, nlh, extack); - pr_info("Index %d wasn't found in client list\n", index); - return -EINVAL; + return 0; } -static void ibnl_rcv_reply_skb(struct sk_buff *skb) +/* + * This function is similar to netlink_rcv_skb with one exception: + * It calls to the callback for the netlink messages without NLM_F_REQUEST + * flag. These messages are intended for RDMA_NL_LS consumer, so it is allowed + * for that consumer only. + */ +static int rdma_nl_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, + struct nlmsghdr *, + struct netlink_ext_ack *)) { + struct netlink_ext_ack extack = {}; struct nlmsghdr *nlh; - int msglen; + int err; - /* - * Process responses until there is no more message or the first - * request. Generally speaking, it is not recommended to mix responses - * with requests. - */ while (skb->len >= nlmsg_total_size(0)) { + int msglen; + nlh = nlmsg_hdr(skb); + err = 0; if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len) - return; - - /* Handle response only */ - if (nlh->nlmsg_flags & NLM_F_REQUEST) - return; - - ibnl_rcv_msg(skb, nlh, NULL); + return 0; + /* + * Generally speaking, the only requests are handled + * by the kernel, but RDMA_NL_LS is different, because it + * runs backward netlink scheme. Kernel initiates messages + * and waits for reply with data to keep pathrecord cache + * in sync. + */ + if (!(nlh->nlmsg_flags & NLM_F_REQUEST) && + (RDMA_NL_GET_CLIENT(nlh->nlmsg_type) != RDMA_NL_LS)) + goto ack; + + /* Skip control messages */ + if (nlh->nlmsg_type < NLMSG_MIN_TYPE) + goto ack; + + err = cb(skb, nlh, &extack); + if (err == -EINTR) + goto skip; + +ack: + if (nlh->nlmsg_flags & NLM_F_ACK || err) + netlink_ack(skb, nlh, err, &extack); + +skip: msglen = NLMSG_ALIGN(nlh->nlmsg_len); if (msglen > skb->len) msglen = skb->len; skb_pull(skb, msglen); } + + return 0; +} + +static void rdma_nl_rcv(struct sk_buff *skb) +{ + mutex_lock(&rdma_nl_mutex); + rdma_nl_rcv_skb(skb, &rdma_nl_rcv_msg); + mutex_unlock(&rdma_nl_mutex); } -static void ibnl_rcv(struct sk_buff *skb) +int rdma_nl_unicast(struct sk_buff *skb, u32 pid) { - mutex_lock(&ibnl_mutex); - ibnl_rcv_reply_skb(skb); - netlink_rcv_skb(skb, &ibnl_rcv_msg); - mutex_unlock(&ibnl_mutex); + int err; + + err = netlink_unicast(nls, skb, pid, MSG_DONTWAIT); + return (err < 0) ? err : 0; } +EXPORT_SYMBOL(rdma_nl_unicast); -int ibnl_unicast(struct sk_buff *skb, struct nlmsghdr *nlh, - __u32 pid) +int rdma_nl_unicast_wait(struct sk_buff *skb, __u32 pid) { int err; err = netlink_unicast(nls, skb, pid, 0); return (err < 0) ? 
err : 0; } -EXPORT_SYMBOL(ibnl_unicast); +EXPORT_SYMBOL(rdma_nl_unicast_wait); -int ibnl_multicast(struct sk_buff *skb, struct nlmsghdr *nlh, - unsigned int group, gfp_t flags) +int rdma_nl_multicast(struct sk_buff *skb, unsigned int group, gfp_t flags) { return nlmsg_multicast(nls, skb, 0, group, flags); } -EXPORT_SYMBOL(ibnl_multicast); +EXPORT_SYMBOL(rdma_nl_multicast); -int __init ibnl_init(void) +int __init rdma_nl_init(void) { struct netlink_kernel_cfg cfg = { - .input = ibnl_rcv, + .input = rdma_nl_rcv, }; nls = netlink_kernel_create(&init_net, NETLINK_RDMA, &cfg); - if (!nls) { - pr_warn("Failed to create netlink socket\n"); + if (!nls) return -ENOMEM; - } nls->sk_sndtimeo = 10 * HZ; return 0; } -void ibnl_cleanup(void) +void rdma_nl_exit(void) { - struct ibnl_client *cur, *next; + int idx; - mutex_lock(&ibnl_mutex); - list_for_each_entry_safe(cur, next, &client_list, list) { - list_del(&(cur->list)); - kfree(cur); - } - mutex_unlock(&ibnl_mutex); + for (idx = 0; idx < RDMA_NL_NUM_CLIENTS; idx++) + rdma_nl_unregister(idx); netlink_kernel_release(nls); } + +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_RDMA); diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c new file mode 100644 index 000000000000..2fae850a3eff --- /dev/null +++ b/drivers/infiniband/core/nldev.c @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <linux/module.h> +#include <net/netlink.h> +#include <rdma/rdma_netlink.h> + +#include "core_priv.h" + +static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { + [RDMA_NLDEV_ATTR_DEV_INDEX] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, + .len = IB_DEVICE_NAME_MAX - 1}, + [RDMA_NLDEV_ATTR_PORT_INDEX] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_FW_VERSION] = { .type = NLA_NUL_STRING, + .len = IB_FW_VERSION_NAME_MAX - 1}, + [RDMA_NLDEV_ATTR_NODE_GUID] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_SUBNET_PREFIX] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_LID] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_SM_LID] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_LMC] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_PORT_STATE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_PORT_PHYS_STATE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_DEV_NODE_TYPE] = { .type = NLA_U8 }, +}; + +static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) +{ + char fw[IB_FW_VERSION_NAME_MAX]; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index)) + return -EMSGSIZE; + if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, device->name)) + return -EMSGSIZE; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, rdma_end_port(device))) + return -EMSGSIZE; + + BUILD_BUG_ON(sizeof(device->attrs.device_cap_flags) != sizeof(u64)); + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS, + device->attrs.device_cap_flags, 0)) + return -EMSGSIZE; + + ib_get_device_fw_str(device, fw); + /* Device without FW has strlen(fw) */ + if (strlen(fw) && nla_put_string(msg, RDMA_NLDEV_ATTR_FW_VERSION, fw)) + return -EMSGSIZE; + + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_NODE_GUID, + be64_to_cpu(device->node_guid), 0)) + return -EMSGSIZE; + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SYS_IMAGE_GUID, + be64_to_cpu(device->attrs.sys_image_guid), 0)) + return -EMSGSIZE; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_NODE_TYPE, device->node_type)) + return -EMSGSIZE; + return 0; +} + +static int fill_port_info(struct sk_buff *msg, + struct ib_device *device, u32 port) +{ + struct ib_port_attr attr; + int ret; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index)) + return -EMSGSIZE; + if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, device->name)) + return -EMSGSIZE; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) + return -EMSGSIZE; + + ret = ib_query_port(device, port, &attr); + if (ret) + return ret; + + BUILD_BUG_ON(sizeof(attr.port_cap_flags) > sizeof(u64)); + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS, + (u64)attr.port_cap_flags, 0)) + return -EMSGSIZE; + if (rdma_protocol_ib(device, port) && + nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SUBNET_PREFIX, + attr.subnet_prefix, 0)) + return -EMSGSIZE; + if (rdma_protocol_ib(device, port)) { + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_LID, attr.lid)) + return -EMSGSIZE; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_SM_LID, attr.sm_lid)) + return -EMSGSIZE; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_LMC, attr.lmc)) + return -EMSGSIZE; + } + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_STATE, attr.state)) + return -EMSGSIZE; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_PHYS_STATE, attr.phys_state)) + return -EMSGSIZE; + return 0; +} + +static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + struct sk_buff *msg; + u32 index; + int err; + + err = nlmsg_parse(nlh, 0, tb, 
RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + + device = __ib_device_get_by_index(index); + if (!device) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), + 0, 0); + + err = fill_dev_info(msg, device); + if (err) { + nlmsg_free(msg); + return err; + } + + nlmsg_end(msg, nlh); + + return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); +} + +static int _nldev_get_dumpit(struct ib_device *device, + struct sk_buff *skb, + struct netlink_callback *cb, + unsigned int idx) +{ + int start = cb->args[0]; + struct nlmsghdr *nlh; + + if (idx < start) + return 0; + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), + 0, NLM_F_MULTI); + + if (fill_dev_info(skb, device)) { + nlmsg_cancel(skb, nlh); + goto out; + } + + nlmsg_end(skb, nlh); + + idx++; + +out: cb->args[0] = idx; + return skb->len; +} + +static int nldev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + /* + * There is no need to take lock, because + * we are relying on ib_core's lists_rwsem + */ + return ib_enum_all_devs(_nldev_get_dumpit, skb, cb); +} + +static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + struct sk_buff *msg; + u32 index; + u32 port; + int err; + + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (err || + !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || + !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = __ib_device_get_by_index(index); + if (!device) + return -EINVAL; + + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), + 0, 0); + + err = fill_port_info(msg, device, port); + if (err) { + nlmsg_free(msg); + return err; + } + + nlmsg_end(msg, nlh); + + return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); +} + +static int nldev_port_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + int start = cb->args[0]; + struct nlmsghdr *nlh; + u32 idx = 0; + u32 ifindex; + int err; + u32 p; + + err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NULL); + if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + ifindex = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = __ib_device_get_by_index(ifindex); + if (!device) + return -EINVAL; + + for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) { + /* + * The dumpit function returns all information from specific + * index. This specific index is taken from the netlink + * messages request sent by user and it is available + * in cb->args[0]. + * + * Usually, the user doesn't fill this field and it causes + * to return everything. 
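[Editor's illustrative note, not part of the patch] From user space, the new nldev client is reached over the NETLINK_RDMA socket family; a NLM_F_DUMP request with the type built by RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET) lands in nldev_get_dumpit() above. The sketch below assumes the companion uapi definitions from this series (RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET, RDMA_NLDEV_ATTR_*) are installed in <rdma/rdma_netlink.h>.

	#include <linux/netlink.h>
	#include <rdma/rdma_netlink.h>
	#include <sys/socket.h>
	#include <string.h>
	#include <unistd.h>

	/* Minimal user-space sketch: ask nldev to dump all RDMA devices. */
	int dump_rdma_devices(void)
	{
		struct sockaddr_nl dst = { .nl_family = AF_NETLINK };
		struct nlmsghdr req;
		int fd;

		fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_RDMA);
		if (fd < 0)
			return -1;

		memset(&req, 0, sizeof(req));
		req.nlmsg_len = NLMSG_HDRLEN;
		req.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
						  RDMA_NLDEV_CMD_GET);
		/* NLM_F_DUMP routes the request to nldev_get_dumpit() */
		req.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;

		if (sendto(fd, &req, req.nlmsg_len, 0,
			   (struct sockaddr *)&dst, sizeof(dst)) < 0) {
			close(fd);
			return -1;
		}
		/* recv() the NLM_F_MULTI replies and parse the
		 * RDMA_NLDEV_ATTR_* attributes here.
		 */
		close(fd);
		return 0;
	}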
+ * + */ + if (idx < start) { + idx++; + continue; + } + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_PORT_GET), + 0, NLM_F_MULTI); + + if (fill_port_info(skb, device, p)) { + nlmsg_cancel(skb, nlh); + goto out; + } + idx++; + nlmsg_end(skb, nlh); + } + +out: cb->args[0] = idx; + return skb->len; +} + +static const struct rdma_nl_cbs nldev_cb_table[] = { + [RDMA_NLDEV_CMD_GET] = { + .doit = nldev_get_doit, + .dump = nldev_get_dumpit, + }, + [RDMA_NLDEV_CMD_PORT_GET] = { + .doit = nldev_port_get_doit, + .dump = nldev_port_get_dumpit, + }, +}; + +void __init nldev_init(void) +{ + rdma_nl_register(RDMA_NL_NLDEV, nldev_cb_table); +} + +void __exit nldev_exit(void) +{ + rdma_nl_unregister(RDMA_NL_NLDEV); +} + +MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_NLDEV, 5); diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 41c31a2bf093..85b5ee4defa4 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -35,10 +35,57 @@ #include <rdma/ib_verbs.h> #include <rdma/uverbs_types.h> #include <linux/rcupdate.h> +#include <rdma/uverbs_ioctl.h> +#include <rdma/rdma_user_ioctl.h> #include "uverbs.h" #include "core_priv.h" #include "rdma_core.h" +int uverbs_ns_idx(u16 *id, unsigned int ns_count) +{ + int ret = (*id & UVERBS_ID_NS_MASK) >> UVERBS_ID_NS_SHIFT; + + if (ret >= ns_count) + return -EINVAL; + + *id &= ~UVERBS_ID_NS_MASK; + return ret; +} + +const struct uverbs_object_spec *uverbs_get_object(const struct ib_device *ibdev, + uint16_t object) +{ + const struct uverbs_root_spec *object_hash = ibdev->specs_root; + const struct uverbs_object_spec_hash *objects; + int ret = uverbs_ns_idx(&object, object_hash->num_buckets); + + if (ret < 0) + return NULL; + + objects = object_hash->object_buckets[ret]; + + if (object >= objects->num_objects) + return NULL; + + return objects->objects[object]; +} + +const struct uverbs_method_spec *uverbs_get_method(const struct uverbs_object_spec *object, + uint16_t method) +{ + const struct uverbs_method_spec_hash *methods; + int ret = uverbs_ns_idx(&method, object->num_buckets); + + if (ret < 0) + return NULL; + + methods = object->method_buckets[ret]; + if (method >= methods->num_methods) + return NULL; + + return methods->methods[method]; +} + void uverbs_uobject_get(struct ib_uobject *uobject) { kref_get(&uobject->ref); @@ -404,6 +451,41 @@ int __must_check rdma_remove_commit_uobject(struct ib_uobject *uobj) return ret; } +static int null_obj_type_class_remove_commit(struct ib_uobject *uobj, + enum rdma_remove_reason why) +{ + return 0; +} + +static const struct uverbs_obj_type null_obj_type = { + .type_class = &((const struct uverbs_obj_type_class){ + .remove_commit = null_obj_type_class_remove_commit, + /* be cautious */ + .needs_kfree_rcu = true}), +}; + +int rdma_explicit_destroy(struct ib_uobject *uobject) +{ + int ret; + struct ib_ucontext *ucontext = uobject->context; + + /* Cleanup is running. 
Calling this should have been impossible */ + if (!down_read_trylock(&ucontext->cleanup_rwsem)) { + WARN(true, "ib_uverbs: Cleanup is running while removing an uobject\n"); + return 0; + } + lockdep_check(uobject, true); + ret = uobject->type->type_class->remove_commit(uobject, + RDMA_REMOVE_DESTROY); + if (ret) + return ret; + + uobject->type = &null_obj_type; + + up_read(&ucontext->cleanup_rwsem); + return 0; +} + static void alloc_commit_idr_uobject(struct ib_uobject *uobj) { uverbs_uobject_add(uobj); @@ -625,3 +707,100 @@ const struct uverbs_obj_type_class uverbs_fd_class = { .needs_kfree_rcu = false, }; +struct ib_uobject *uverbs_get_uobject_from_context(const struct uverbs_obj_type *type_attrs, + struct ib_ucontext *ucontext, + enum uverbs_obj_access access, + int id) +{ + switch (access) { + case UVERBS_ACCESS_READ: + return rdma_lookup_get_uobject(type_attrs, ucontext, id, false); + case UVERBS_ACCESS_DESTROY: + case UVERBS_ACCESS_WRITE: + return rdma_lookup_get_uobject(type_attrs, ucontext, id, true); + case UVERBS_ACCESS_NEW: + return rdma_alloc_begin_uobject(type_attrs, ucontext); + default: + WARN_ON(true); + return ERR_PTR(-EOPNOTSUPP); + } +} + +int uverbs_finalize_object(struct ib_uobject *uobj, + enum uverbs_obj_access access, + bool commit) +{ + int ret = 0; + + /* + * refcounts should be handled at the object level and not at the + * uobject level. Refcounts of the objects themselves are done in + * handlers. + */ + + switch (access) { + case UVERBS_ACCESS_READ: + rdma_lookup_put_uobject(uobj, false); + break; + case UVERBS_ACCESS_WRITE: + rdma_lookup_put_uobject(uobj, true); + break; + case UVERBS_ACCESS_DESTROY: + if (commit) + ret = rdma_remove_commit_uobject(uobj); + else + rdma_lookup_put_uobject(uobj, true); + break; + case UVERBS_ACCESS_NEW: + if (commit) + ret = rdma_alloc_commit_uobject(uobj); + else + rdma_alloc_abort_uobject(uobj); + break; + default: + WARN_ON(true); + ret = -EOPNOTSUPP; + } + + return ret; +} + +int uverbs_finalize_objects(struct uverbs_attr_bundle *attrs_bundle, + struct uverbs_attr_spec_hash * const *spec_hash, + size_t num, + bool commit) +{ + unsigned int i; + int ret = 0; + + for (i = 0; i < num; i++) { + struct uverbs_attr_bundle_hash *curr_bundle = + &attrs_bundle->hash[i]; + const struct uverbs_attr_spec_hash *curr_spec_bucket = + spec_hash[i]; + unsigned int j; + + for (j = 0; j < curr_bundle->num_attrs; j++) { + struct uverbs_attr *attr; + const struct uverbs_attr_spec *spec; + + if (!uverbs_attr_is_valid_in_hash(curr_bundle, j)) + continue; + + attr = &curr_bundle->attrs[j]; + spec = &curr_spec_bucket->attrs[j]; + + if (spec->type == UVERBS_ATTR_TYPE_IDR || + spec->type == UVERBS_ATTR_TYPE_FD) { + int current_ret; + + current_ret = uverbs_finalize_object(attr->obj_attr.uobject, + spec->obj.access, + commit); + if (!ret) + ret = current_ret; + } + } + } + return ret; +} diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h index 1b82e7ff7fe8..1efcf93238dd 100644 --- a/drivers/infiniband/core/rdma_core.h +++ b/drivers/infiniband/core/rdma_core.h @@ -39,9 +39,15 @@ #include <linux/idr.h> #include <rdma/uverbs_types.h> +#include <rdma/uverbs_ioctl.h> #include <rdma/ib_verbs.h> #include <linux/mutex.h> +int uverbs_ns_idx(u16 *id, unsigned int ns_count); +const struct uverbs_object_spec *uverbs_get_object(const struct ib_device *ibdev, + uint16_t object); +const struct uverbs_method_spec *uverbs_get_method(const struct uverbs_object_spec *object, + uint16_t method); /* * These functions initialize the 
context and cleanups its uobjects. * The context has a list of objects which is protected by a mutex @@ -75,4 +81,40 @@ void uverbs_uobject_put(struct ib_uobject *uobject); */ void uverbs_close_fd(struct file *f); +/* + * Get an ib_uobject that corresponds to the given id from ucontext, assuming + * the object is from the given type. Lock it to the required access when + * applicable. + * This function could create (access == NEW), destroy (access == DESTROY) + * or unlock (access == READ || access == WRITE) objects if required. + * The action will be finalized only when uverbs_finalize_object or + * uverbs_finalize_objects are called. + */ +struct ib_uobject *uverbs_get_uobject_from_context(const struct uverbs_obj_type *type_attrs, + struct ib_ucontext *ucontext, + enum uverbs_obj_access access, + int id); +int uverbs_finalize_object(struct ib_uobject *uobj, + enum uverbs_obj_access access, + bool commit); +/* + * Note that certain finalize stages could return a status: + * (a) alloc_commit could return a failure if the object is committed at the + * same time when the context is destroyed. + * (b) remove_commit could fail if the object wasn't destroyed successfully. + * Since multiple objects could be finalized in one transaction, it is very NOT + * recommended to have several finalize actions which have side effects. + * For example, it's NOT recommended to have a certain action which has both + * a commit action and a destroy action or two destroy objects in the same + * action. The rule of thumb is to have one destroy or commit action with + * multiple lookups. + * The first non zero return value of finalize_object is returned from this + * function. For example, this could happen when we couldn't destroy an + * object. + */ +int uverbs_finalize_objects(struct uverbs_attr_bundle *attrs_bundle, + struct uverbs_attr_spec_hash * const *spec_hash, + size_t num, + bool commit); + #endif /* RDMA_CORE_H */ diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c index db958d3207ef..90e3889b7fbe 100644 --- a/drivers/infiniband/core/roce_gid_mgmt.c +++ b/drivers/infiniband/core/roce_gid_mgmt.c @@ -42,6 +42,10 @@ #include <rdma/ib_cache.h> #include <rdma/ib_addr.h> +static struct workqueue_struct *gid_cache_wq; + +static struct workqueue_struct *gid_cache_wq; + enum gid_op_type { GID_DEL = 0, GID_ADD @@ -560,7 +564,7 @@ static int netdevice_queue_work(struct netdev_event_work_cmd *cmds, } INIT_WORK(&ndev_work->work, netdevice_event_work_handler); - queue_work(ib_wq, &ndev_work->work); + queue_work(gid_cache_wq, &ndev_work->work); return NOTIFY_DONE; } @@ -693,7 +697,7 @@ static int addr_event(struct notifier_block *this, unsigned long event, dev_hold(ndev); work->gid_attr.ndev = ndev; - queue_work(ib_wq, &work->work); + queue_work(gid_cache_wq, &work->work); return NOTIFY_DONE; } @@ -740,6 +744,10 @@ static struct notifier_block nb_inet6addr = { int __init roce_gid_mgmt_init(void) { + gid_cache_wq = alloc_ordered_workqueue("gid-cache-wq", 0); + if (!gid_cache_wq) + return -ENOMEM; + register_inetaddr_notifier(&nb_inetaddr); if (IS_ENABLED(CONFIG_IPV6)) register_inet6addr_notifier(&nb_inet6addr); @@ -764,4 +772,5 @@ void __exit roce_gid_mgmt_cleanup(void) * ib-core is removed, all physical devices have been removed, * so no issue with remaining hardware contexts. 
*/ + destroy_workqueue(gid_cache_wq); } diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c index dbfd854c32c9..6ca607e8e293 100644 --- a/drivers/infiniband/core/rw.c +++ b/drivers/infiniband/core/rw.c @@ -643,6 +643,30 @@ void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp, } EXPORT_SYMBOL(rdma_rw_ctx_destroy_signature); +/** + * rdma_rw_mr_factor - return number of MRs required for a payload + * @device: device handling the connection + * @port_num: port num to which the connection is bound + * @maxpages: maximum payload pages per rdma_rw_ctx + * + * Returns the number of MRs the device requires to move @maxpayload + * bytes. The returned value is used during transport creation to + * compute max_rdma_ctxts and the size of the transport's Send and + * Send Completion Queues. + */ +unsigned int rdma_rw_mr_factor(struct ib_device *device, u8 port_num, + unsigned int maxpages) +{ + unsigned int mr_pages; + + if (rdma_rw_can_use_mr(device, port_num)) + mr_pages = rdma_rw_fr_page_list_len(device); + else + mr_pages = device->attrs.max_sge_rd; + return DIV_ROUND_UP(maxpages, mr_pages); +} +EXPORT_SYMBOL(rdma_rw_mr_factor); + void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr) { u32 factor; diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 70fa4cabe48e..ab5e1024fea9 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -50,6 +50,7 @@ #include <uapi/rdma/ib_user_sa.h> #include <rdma/ib_marshall.h> #include <rdma/ib_addr.h> +#include <rdma/opa_addr.h> #include "sa.h" #include "core_priv.h" @@ -861,7 +862,7 @@ static int ib_nl_send_msg(struct ib_sa_query *query, gfp_t gfp_mask) /* Repair the nlmsg header length */ nlmsg_end(skb, nlh); - ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_LS, gfp_mask); + ret = rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, gfp_mask); if (!ret) ret = len; else @@ -1021,9 +1022,9 @@ static void ib_nl_request_timeout(struct work_struct *work) } int ib_nl_handle_set_timeout(struct sk_buff *skb, - struct netlink_callback *cb) + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { - const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh; int timeout, delta, abs_delta; const struct nlattr *attr; unsigned long flags; @@ -1033,8 +1034,7 @@ int ib_nl_handle_set_timeout(struct sk_buff *skb, int ret; if (!(nlh->nlmsg_flags & NLM_F_REQUEST) || - !(NETLINK_CB(skb).sk) || - !netlink_capable(skb, CAP_NET_ADMIN)) + !(NETLINK_CB(skb).sk)) return -EPERM; ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), @@ -1098,9 +1098,9 @@ static inline int ib_nl_is_good_resolve_resp(const struct nlmsghdr *nlh) } int ib_nl_handle_resolve_resp(struct sk_buff *skb, - struct netlink_callback *cb) + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { - const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh; unsigned long flags; struct ib_sa_query *query; struct ib_mad_send_buf *send_buf; @@ -1109,8 +1109,7 @@ int ib_nl_handle_resolve_resp(struct sk_buff *skb, int ret; if ((nlh->nlmsg_flags & NLM_F_REQUEST) || - !(NETLINK_CB(skb).sk) || - !netlink_capable(skb, CAP_NET_ADMIN)) + !(NETLINK_CB(skb).sk)) return -EPERM; spin_lock_irqsave(&ib_nl_request_lock, flags); @@ -1241,6 +1240,11 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, ah_attr->type = rdma_ah_find_type(device, port_num); rdma_ah_set_dlid(ah_attr, be32_to_cpu(sa_path_get_dlid(rec))); + + if ((ah_attr->type == RDMA_AH_ATTR_TYPE_OPA) && + (rdma_ah_get_dlid(ah_attr) == 
be16_to_cpu(IB_LID_PERMISSIVE))) + rdma_ah_set_make_grd(ah_attr, true); + rdma_ah_set_sl(ah_attr, rec->sl); rdma_ah_set_path_bits(ah_attr, be32_to_cpu(sa_path_get_slid(rec)) & get_src_path_mask(device, port_num)); @@ -1420,7 +1424,7 @@ static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask) if ((query->flags & IB_SA_ENABLE_LOCAL_SERVICE) && (!(query->flags & IB_SA_QUERY_OPA))) { - if (!ibnl_chk_listeners(RDMA_NL_GROUP_LS)) { + if (!rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) { if (!ib_nl_make_request(query, gfp_mask)) return id; } @@ -2290,12 +2294,15 @@ static void update_sm_ah(struct work_struct *work) rdma_ah_set_sl(&ah_attr, port_attr.sm_sl); rdma_ah_set_port_num(&ah_attr, port->port_num); if (port_attr.grh_required) { - rdma_ah_set_ah_flags(&ah_attr, IB_AH_GRH); - - rdma_ah_set_subnet_prefix(&ah_attr, - cpu_to_be64(port_attr.subnet_prefix)); - rdma_ah_set_interface_id(&ah_attr, - cpu_to_be64(IB_SA_WELL_KNOWN_GUID)); + if (ah_attr.type == RDMA_AH_ATTR_TYPE_OPA) { + rdma_ah_set_make_grd(&ah_attr, true); + } else { + rdma_ah_set_ah_flags(&ah_attr, IB_AH_GRH); + rdma_ah_set_subnet_prefix(&ah_attr, + cpu_to_be64(port_attr.subnet_prefix)); + rdma_ah_set_interface_id(&ah_attr, + cpu_to_be64(IB_SA_WELL_KNOWN_GUID)); + } } new_ah->ah = rdma_create_ah(port->agent->qp->pd, &ah_attr); @@ -2410,8 +2417,7 @@ static void ib_sa_add_one(struct ib_device *device) */ INIT_IB_EVENT_HANDLER(&sa_dev->event_handler, device, ib_sa_event); - if (ib_register_event_handler(&sa_dev->event_handler)) - goto err; + ib_register_event_handler(&sa_dev->event_handler); for (i = 0; i <= e - s; ++i) { if (rdma_cap_ib_sa(device, i + 1)) diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c index 70ad19c4c73e..88bdafb297f5 100644 --- a/drivers/infiniband/core/security.c +++ b/drivers/infiniband/core/security.c @@ -432,8 +432,10 @@ int ib_create_qp_security(struct ib_qp *qp, struct ib_device *dev) atomic_set(&qp->qp_sec->error_list_count, 0); init_completion(&qp->qp_sec->error_complete); ret = security_ib_alloc_security(&qp->qp_sec->security); - if (ret) + if (ret) { kfree(qp->qp_sec); + qp->qp_sec = NULL; + } return ret; } diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 7ebe1ef23652..abc5ab581f82 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -1210,8 +1210,8 @@ static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, { struct ib_device *dev = container_of(device, struct ib_device, dev); - ib_get_device_fw_str(dev, buf, PAGE_SIZE); - strlcat(buf, "\n", PAGE_SIZE); + ib_get_device_fw_str(dev, buf); + strlcat(buf, "\n", IB_FW_VERSION_NAME_MAX); return strlen(buf); } diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index 112099c86a19..f2a7f62c2834 100644 --- a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -618,7 +618,7 @@ static ssize_t ib_ucm_init_qp_attr(struct ib_ucm_file *file, if (result) goto out; - ib_copy_qp_attr_to_user(&resp, &qp_attr); + ib_copy_qp_attr_to_user(ctx->cm_id->device, &resp, &qp_attr); if (copy_to_user((void __user *)(unsigned long)cmd.response, &resp, sizeof(resp))) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 276f0ef835bd..eb85b546e223 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -248,14 +248,15 @@ static void ucma_copy_conn_event(struct rdma_ucm_conn_param *dst, dst->qp_num = src->qp_num; } -static void ucma_copy_ud_event(struct 
rdma_ucm_ud_param *dst, +static void ucma_copy_ud_event(struct ib_device *device, + struct rdma_ucm_ud_param *dst, struct rdma_ud_param *src) { if (src->private_data_len) memcpy(dst->private_data, src->private_data, src->private_data_len); dst->private_data_len = src->private_data_len; - ib_copy_ah_attr_to_user(&dst->ah_attr, &src->ah_attr); + ib_copy_ah_attr_to_user(device, &dst->ah_attr, &src->ah_attr); dst->qp_num = src->qp_num; dst->qkey = src->qkey; } @@ -335,7 +336,8 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id, uevent->resp.event = event->event; uevent->resp.status = event->status; if (cm_id->qp_type == IB_QPT_UD) - ucma_copy_ud_event(&uevent->resp.param.ud, &event->param.ud); + ucma_copy_ud_event(cm_id->device, &uevent->resp.param.ud, + &event->param.ud); else ucma_copy_conn_event(&uevent->resp.param.conn, &event->param.conn); @@ -1157,7 +1159,7 @@ static ssize_t ucma_init_qp_attr(struct ucma_file *file, if (ret) goto out; - ib_copy_qp_attr_to_user(&resp, &qp_attr); + ib_copy_qp_attr_to_user(ctx->cm_id->device, &resp, &qp_attr); if (copy_to_user((void __user *)(unsigned long)cmd.response, &resp, sizeof(resp))) ret = -EFAULT; diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 8c4ec564e495..55e8f5ed8b3c 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -166,24 +166,6 @@ static int invalidate_page_trampoline(struct ib_umem *item, u64 start, return 0; } -static void ib_umem_notifier_invalidate_page(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long address) -{ - struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); - - if (!context->invalidate_range) - return; - - ib_ucontext_notifier_start_account(context); - down_read(&context->umem_rwsem); - rbt_ib_umem_for_each_in_range(&context->umem_tree, address, - address + PAGE_SIZE, - invalidate_page_trampoline, NULL); - up_read(&context->umem_rwsem); - ib_ucontext_notifier_end_account(context); -} - static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start, u64 end, void *cookie) { @@ -237,7 +219,6 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, static const struct mmu_notifier_ops ib_umem_notifiers = { .release = ib_umem_notifier_release, - .invalidate_page = ib_umem_notifier_invalidate_page, .invalidate_range_start = ib_umem_notifier_invalidate_range_start, .invalidate_range_end = ib_umem_notifier_invalidate_range_end, }; diff --git a/drivers/infiniband/core/umem_rbtree.c b/drivers/infiniband/core/umem_rbtree.c index d176597b4d78..fc801920e341 100644 --- a/drivers/infiniband/core/umem_rbtree.c +++ b/drivers/infiniband/core/umem_rbtree.c @@ -72,7 +72,7 @@ INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, /* @last is not a part of the interval. See comment for function * node_last. 
*/ -int rbt_ib_umem_for_each_in_range(struct rb_root *root, +int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, u64 start, u64 last, umem_call_back cb, void *cookie) @@ -95,7 +95,7 @@ int rbt_ib_umem_for_each_in_range(struct rb_root *root, } EXPORT_SYMBOL(rbt_ib_umem_for_each_in_range); -struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root *root, +struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root_cached *root, u64 addr, u64 length) { struct umem_odp_node *node; diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 36a6f5c8914c..c1696e6084b2 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -229,7 +229,7 @@ static void recv_handler(struct ib_mad_agent *agent, packet->mad.hdr.status = 0; packet->mad.hdr.length = hdr_size(file) + mad_recv_wc->mad_len; packet->mad.hdr.qpn = cpu_to_be32(mad_recv_wc->wc->src_qp); - packet->mad.hdr.lid = cpu_to_be16(mad_recv_wc->wc->slid); + packet->mad.hdr.lid = ib_lid_be16(mad_recv_wc->wc->slid); packet->mad.hdr.sl = mad_recv_wc->wc->sl; packet->mad.hdr.path_bits = mad_recv_wc->wc->dlid_path_bits; packet->mad.hdr.pkey_index = mad_recv_wc->wc->pkey_index; diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 64d494a64daf..37c8903e7fd0 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -100,6 +100,7 @@ struct ib_uverbs_device { struct mutex lists_mutex; /* protect lists */ struct list_head uverbs_file_list; struct list_head uverbs_events_file_list; + struct uverbs_root_spec *specs_root; }; struct ib_uverbs_event_queue { @@ -218,6 +219,8 @@ int uverbs_dealloc_mw(struct ib_mw *mw); void ib_uverbs_detach_umcast(struct ib_qp *qp, struct ib_uqp_object *uobj); +long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); + struct ib_uverbs_flow_spec { union { union { diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 8ba9bfb073d1..52a2cf2d83aa 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -91,9 +91,10 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, goto err; } - INIT_UDATA(&udata, buf + sizeof cmd, - (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long) cmd.response + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); ret = ib_rdmacg_try_charge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE); if (ret) @@ -117,7 +118,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, ucontext->closing = 0; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - ucontext->umem_tree = RB_ROOT; + ucontext->umem_tree = RB_ROOT_CACHED; init_rwsem(&ucontext->umem_rwsem); ucontext->odp_mrs_count = 0; INIT_LIST_HEAD(&ucontext->no_private_counters); @@ -275,8 +276,14 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, resp.bad_pkey_cntr = attr.bad_pkey_cntr; resp.qkey_viol_cntr = attr.qkey_viol_cntr; resp.pkey_tbl_len = attr.pkey_tbl_len; - resp.lid = attr.lid; - resp.sm_lid = attr.sm_lid; + + if (rdma_cap_opa_ah(ib_dev, cmd.port_num)) { + resp.lid = OPA_TO_IB_UCAST_LID(attr.lid); + resp.sm_lid = OPA_TO_IB_UCAST_LID(attr.sm_lid); + } else { + resp.lid = ib_lid_cpu16(attr.lid); + resp.sm_lid = ib_lid_cpu16(attr.sm_lid); + } resp.lmc = attr.lmc; resp.max_vl_num = attr.max_vl_num; resp.sm_sl = attr.sm_sl; @@ -313,9 +320,10 @@ ssize_t ib_uverbs_alloc_pd(struct 
ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - INIT_UDATA(&udata, buf + sizeof cmd, - (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long) cmd.response + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); uobj = uobj_alloc(uobj_get_type(pd), file->ucontext); if (IS_ERR(uobj)) @@ -482,9 +490,10 @@ ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - INIT_UDATA(&udata, buf + sizeof cmd, - (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long) cmd.response + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); mutex_lock(&file->device->xrcd_tree_mutex); @@ -646,9 +655,10 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - INIT_UDATA(&udata, buf + sizeof cmd, - (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long) cmd.response + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK)) return -EINVAL; @@ -740,7 +750,8 @@ ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file, INIT_UDATA(&udata, buf + sizeof(cmd), (unsigned long) cmd.response + sizeof(resp), - in_len - sizeof(cmd), out_len - sizeof(resp)); + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); if (cmd.flags & ~IB_MR_REREG_SUPPORTED || !cmd.flags) return -EINVAL; @@ -1015,7 +1026,7 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file, cq->uobject = &obj->uobject; cq->comp_handler = ib_uverbs_comp_handler; cq->event_handler = ib_uverbs_cq_event_handler; - cq->cq_context = &ev_file->ev_queue; + cq->cq_context = ev_file ? &ev_file->ev_queue : NULL; atomic_set(&cq->usecnt, 0); obj->uobject.object = cq; @@ -1080,7 +1091,8 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file, INIT_UDATA(&uhw, buf + sizeof(cmd), (unsigned long)cmd.response + sizeof(resp), - in_len - sizeof(cmd), out_len - sizeof(resp)); + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); memset(&cmd_ex, 0, sizeof(cmd_ex)); cmd_ex.user_handle = cmd.user_handle; @@ -1153,7 +1165,7 @@ ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file, int out_len) { struct ib_uverbs_resize_cq cmd; - struct ib_uverbs_resize_cq_resp resp; + struct ib_uverbs_resize_cq_resp resp = {}; struct ib_udata udata; struct ib_cq *cq; int ret = -EINVAL; @@ -1161,9 +1173,10 @@ ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - INIT_UDATA(&udata, buf + sizeof cmd, - (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long) cmd.response + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); cq = uobj_get_obj_read(cq, cmd.cq_handle, file->ucontext); if (!cq) @@ -1185,7 +1198,8 @@ out: return ret ? 
ret : in_len; } -static int copy_wc_to_user(void __user *dest, struct ib_wc *wc) +static int copy_wc_to_user(struct ib_device *ib_dev, void __user *dest, + struct ib_wc *wc) { struct ib_uverbs_wc tmp; @@ -1199,7 +1213,10 @@ static int copy_wc_to_user(void __user *dest, struct ib_wc *wc) tmp.src_qp = wc->src_qp; tmp.wc_flags = wc->wc_flags; tmp.pkey_index = wc->pkey_index; - tmp.slid = wc->slid; + if (rdma_cap_opa_ah(ib_dev, wc->port_num)) + tmp.slid = OPA_TO_IB_UCAST_LID(wc->slid); + else + tmp.slid = ib_lid_cpu16(wc->slid); tmp.sl = wc->sl; tmp.dlid_path_bits = wc->dlid_path_bits; tmp.port_num = wc->port_num; @@ -1243,7 +1260,7 @@ ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file, if (!ret) break; - ret = copy_wc_to_user(data_ptr, &wc); + ret = copy_wc_to_user(ib_dev, data_ptr, &wc); if (ret) goto out_put; @@ -1296,7 +1313,6 @@ ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file, struct ib_uobject *uobj; struct ib_cq *cq; struct ib_ucq_object *obj; - struct ib_uverbs_event_queue *ev_queue; int ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) @@ -1313,7 +1329,6 @@ ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file, */ uverbs_uobject_get(uobj); cq = uobj->object; - ev_queue = cq->cq_context; obj = container_of(cq->uobject, struct ib_ucq_object, uobject); memset(&resp, 0, sizeof(resp)); @@ -1385,8 +1400,9 @@ static int create_qp(struct ib_uverbs_file *file, attr.rwq_ind_tbl = ind_tbl; } - if ((cmd_sz >= offsetof(typeof(*cmd), reserved1) + - sizeof(cmd->reserved1)) && cmd->reserved1) { + if (cmd_sz > sizeof(*cmd) && + !ib_is_udata_cleared(ucore, sizeof(*cmd), + cmd_sz - sizeof(*cmd))) { ret = -EOPNOTSUPP; goto err_put; } @@ -1422,7 +1438,7 @@ static int create_qp(struct ib_uverbs_file *file, if (cmd->is_srq) { srq = uobj_get_obj_read(srq, cmd->srq_handle, file->ucontext); - if (!srq || srq->srq_type != IB_SRQT_BASIC) { + if (!srq || srq->srq_type == IB_SRQT_XRC) { ret = -EINVAL; goto err_put; } @@ -1484,11 +1500,21 @@ static int create_qp(struct ib_uverbs_file *file, IB_QP_CREATE_MANAGED_SEND | IB_QP_CREATE_MANAGED_RECV | IB_QP_CREATE_SCATTER_FCS | - IB_QP_CREATE_CVLAN_STRIPPING)) { + IB_QP_CREATE_CVLAN_STRIPPING | + IB_QP_CREATE_SOURCE_QPN)) { ret = -EINVAL; goto err_put; } + if (attr.create_flags & IB_QP_CREATE_SOURCE_QPN) { + if (!capable(CAP_NET_RAW)) { + ret = -EPERM; + goto err_put; + } + + attr.source_qpn = cmd->source_qpn; + } + buf = (void *)cmd + sizeof(*cmd); if (cmd_sz > sizeof(*cmd)) if (!(buf[0] == 0 && !memcmp(buf, buf + 1, @@ -1524,6 +1550,7 @@ static int create_qp(struct ib_uverbs_file *file, qp->qp_type = attr.qp_type; atomic_set(&qp->usecnt, 0); atomic_inc(&pd->usecnt); + qp->port = 0; if (attr.send_cq) atomic_inc(&attr.send_cq->usecnt); if (attr.recv_cq) @@ -1723,9 +1750,10 @@ ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - INIT_UDATA(&udata, buf + sizeof cmd, - (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long) cmd.response + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); obj = (struct ib_uqp_object *)uobj_alloc(uobj_get_type(qp), file->ucontext); @@ -1792,6 +1820,28 @@ err_put: return ret; } +static void copy_ah_attr_to_uverbs(struct ib_uverbs_qp_dest *uverb_attr, + struct rdma_ah_attr *rdma_attr) +{ + const struct ib_global_route *grh; + + uverb_attr->dlid = rdma_ah_get_dlid(rdma_attr); + uverb_attr->sl = rdma_ah_get_sl(rdma_attr); 
+ uverb_attr->src_path_bits = rdma_ah_get_path_bits(rdma_attr); + uverb_attr->static_rate = rdma_ah_get_static_rate(rdma_attr); + uverb_attr->is_global = !!(rdma_ah_get_ah_flags(rdma_attr) & + IB_AH_GRH); + if (uverb_attr->is_global) { + grh = rdma_ah_read_grh(rdma_attr); + memcpy(uverb_attr->dgid, grh->dgid.raw, 16); + uverb_attr->flow_label = grh->flow_label; + uverb_attr->sgid_index = grh->sgid_index; + uverb_attr->hop_limit = grh->hop_limit; + uverb_attr->traffic_class = grh->traffic_class; + } + uverb_attr->port_num = rdma_ah_get_port_num(rdma_attr); +} + ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file, struct ib_device *ib_dev, const char __user *buf, int in_len, @@ -1802,7 +1852,6 @@ ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file, struct ib_qp *qp; struct ib_qp_attr *attr; struct ib_qp_init_attr *init_attr; - const struct ib_global_route *grh; int ret; if (copy_from_user(&cmd, buf, sizeof cmd)) @@ -1852,39 +1901,8 @@ ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file, resp.alt_port_num = attr->alt_port_num; resp.alt_timeout = attr->alt_timeout; - resp.dest.dlid = rdma_ah_get_dlid(&attr->ah_attr); - resp.dest.sl = rdma_ah_get_sl(&attr->ah_attr); - resp.dest.src_path_bits = rdma_ah_get_path_bits(&attr->ah_attr); - resp.dest.static_rate = rdma_ah_get_static_rate(&attr->ah_attr); - resp.dest.is_global = !!(rdma_ah_get_ah_flags(&attr->ah_attr) & - IB_AH_GRH); - if (resp.dest.is_global) { - grh = rdma_ah_read_grh(&attr->ah_attr); - memcpy(resp.dest.dgid, grh->dgid.raw, 16); - resp.dest.flow_label = grh->flow_label; - resp.dest.sgid_index = grh->sgid_index; - resp.dest.hop_limit = grh->hop_limit; - resp.dest.traffic_class = grh->traffic_class; - } - resp.dest.port_num = rdma_ah_get_port_num(&attr->ah_attr); - - resp.alt_dest.dlid = rdma_ah_get_dlid(&attr->alt_ah_attr); - resp.alt_dest.sl = rdma_ah_get_sl(&attr->alt_ah_attr); - resp.alt_dest.src_path_bits = rdma_ah_get_path_bits(&attr->alt_ah_attr); - resp.alt_dest.static_rate - = rdma_ah_get_static_rate(&attr->alt_ah_attr); - resp.alt_dest.is_global - = !!(rdma_ah_get_ah_flags(&attr->alt_ah_attr) & - IB_AH_GRH); - if (resp.alt_dest.is_global) { - grh = rdma_ah_read_grh(&attr->alt_ah_attr); - memcpy(resp.alt_dest.dgid, grh->dgid.raw, 16); - resp.alt_dest.flow_label = grh->flow_label; - resp.alt_dest.sgid_index = grh->sgid_index; - resp.alt_dest.hop_limit = grh->hop_limit; - resp.alt_dest.traffic_class = grh->traffic_class; - } - resp.alt_dest.port_num = rdma_ah_get_port_num(&attr->alt_ah_attr); + copy_ah_attr_to_uverbs(&resp.dest, &attr->ah_attr); + copy_ah_attr_to_uverbs(&resp.alt_dest, &attr->alt_ah_attr); resp.max_send_wr = init_attr->cap.max_send_wr; resp.max_recv_wr = init_attr->cap.max_recv_wr; @@ -1918,6 +1936,29 @@ static int modify_qp_mask(enum ib_qp_type qp_type, int mask) } } +static void copy_ah_attr_from_uverbs(struct ib_device *dev, + struct rdma_ah_attr *rdma_attr, + struct ib_uverbs_qp_dest *uverb_attr) +{ + rdma_attr->type = rdma_ah_find_type(dev, uverb_attr->port_num); + if (uverb_attr->is_global) { + rdma_ah_set_grh(rdma_attr, NULL, + uverb_attr->flow_label, + uverb_attr->sgid_index, + uverb_attr->hop_limit, + uverb_attr->traffic_class); + rdma_ah_set_dgid_raw(rdma_attr, uverb_attr->dgid); + } else { + rdma_ah_set_ah_flags(rdma_attr, 0); + } + rdma_ah_set_dlid(rdma_attr, uverb_attr->dlid); + rdma_ah_set_sl(rdma_attr, uverb_attr->sl); + rdma_ah_set_path_bits(rdma_attr, uverb_attr->src_path_bits); + rdma_ah_set_static_rate(rdma_attr, uverb_attr->static_rate); + rdma_ah_set_port_num(rdma_attr, 
uverb_attr->port_num); + rdma_ah_set_make_grd(rdma_attr, false); +} + static int modify_qp(struct ib_uverbs_file *file, struct ib_uverbs_ex_modify_qp *cmd, struct ib_udata *udata) { @@ -1935,7 +1976,8 @@ static int modify_qp(struct ib_uverbs_file *file, goto out; } - if (!rdma_is_port_valid(qp->device, cmd->base.port_num)) { + if ((cmd->base.attr_mask & IB_QP_PORT) && + !rdma_is_port_valid(qp->device, cmd->base.port_num)) { ret = -EINVAL; goto release_qp; } @@ -1963,70 +2005,21 @@ static int modify_qp(struct ib_uverbs_file *file, attr->alt_timeout = cmd->base.alt_timeout; attr->rate_limit = cmd->rate_limit; - attr->ah_attr.type = rdma_ah_find_type(qp->device, - cmd->base.dest.port_num); - if (cmd->base.dest.is_global) { - rdma_ah_set_grh(&attr->ah_attr, NULL, - cmd->base.dest.flow_label, - cmd->base.dest.sgid_index, - cmd->base.dest.hop_limit, - cmd->base.dest.traffic_class); - rdma_ah_set_dgid_raw(&attr->ah_attr, cmd->base.dest.dgid); - } else { - rdma_ah_set_ah_flags(&attr->ah_attr, 0); - } - rdma_ah_set_dlid(&attr->ah_attr, cmd->base.dest.dlid); - rdma_ah_set_sl(&attr->ah_attr, cmd->base.dest.sl); - rdma_ah_set_path_bits(&attr->ah_attr, cmd->base.dest.src_path_bits); - rdma_ah_set_static_rate(&attr->ah_attr, cmd->base.dest.static_rate); - rdma_ah_set_port_num(&attr->ah_attr, - cmd->base.dest.port_num); - - attr->alt_ah_attr.type = rdma_ah_find_type(qp->device, - cmd->base.dest.port_num); - if (cmd->base.alt_dest.is_global) { - rdma_ah_set_grh(&attr->alt_ah_attr, NULL, - cmd->base.alt_dest.flow_label, - cmd->base.alt_dest.sgid_index, - cmd->base.alt_dest.hop_limit, - cmd->base.alt_dest.traffic_class); - rdma_ah_set_dgid_raw(&attr->alt_ah_attr, - cmd->base.alt_dest.dgid); - } else { - rdma_ah_set_ah_flags(&attr->alt_ah_attr, 0); - } + if (cmd->base.attr_mask & IB_QP_AV) + copy_ah_attr_from_uverbs(qp->device, &attr->ah_attr, + &cmd->base.dest); - rdma_ah_set_dlid(&attr->alt_ah_attr, cmd->base.alt_dest.dlid); - rdma_ah_set_sl(&attr->alt_ah_attr, cmd->base.alt_dest.sl); - rdma_ah_set_path_bits(&attr->alt_ah_attr, - cmd->base.alt_dest.src_path_bits); - rdma_ah_set_static_rate(&attr->alt_ah_attr, - cmd->base.alt_dest.static_rate); - rdma_ah_set_port_num(&attr->alt_ah_attr, - cmd->base.alt_dest.port_num); + if (cmd->base.attr_mask & IB_QP_ALT_PATH) + copy_ah_attr_from_uverbs(qp->device, &attr->alt_ah_attr, + &cmd->base.alt_dest); - if (qp->real_qp == qp) { - if (cmd->base.attr_mask & IB_QP_AV) { - ret = ib_resolve_eth_dmac(qp->device, &attr->ah_attr); - if (ret) - goto release_qp; - } - ret = ib_security_modify_qp(qp, - attr, - modify_qp_mask(qp->qp_type, - cmd->base.attr_mask), - udata); - } else { - ret = ib_security_modify_qp(qp, - attr, - modify_qp_mask(qp->qp_type, - cmd->base.attr_mask), - NULL); - } + ret = ib_modify_qp_with_udata(qp, attr, + modify_qp_mask(qp->qp_type, + cmd->base.attr_mask), + udata); release_qp: uobj_put_obj_read(qp); - out: kfree(attr); @@ -2050,7 +2043,8 @@ ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file, return -EOPNOTSUPP; INIT_UDATA(&udata, buf + sizeof(cmd.base), NULL, - in_len - sizeof(cmd.base), out_len); + in_len - sizeof(cmd.base) - sizeof(struct ib_uverbs_cmd_hdr), + out_len); ret = modify_qp(file, &cmd, &udata); if (ret) @@ -2103,7 +2097,6 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file, struct ib_uverbs_destroy_qp cmd; struct ib_uverbs_destroy_qp_resp resp; struct ib_uobject *uobj; - struct ib_qp *qp; struct ib_uqp_object *obj; int ret = -EINVAL; @@ -2117,7 +2110,6 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file, if 
(IS_ERR(uobj)) return PTR_ERR(uobj); - qp = uobj->object; obj = container_of(uobj, struct ib_uqp_object, uevent.uobject); /* * Make sure we don't free the memory in remove_commit as we still @@ -2558,7 +2550,8 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, INIT_UDATA(&udata, buf + sizeof(cmd), (unsigned long)cmd.response + sizeof(resp), - in_len - sizeof(cmd), out_len - sizeof(resp)); + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); uobj = uobj_alloc(uobj_get_type(ah), file->ucontext); if (IS_ERR(uobj)) @@ -2571,6 +2564,7 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, } attr.type = rdma_ah_find_type(ib_dev, cmd.attr.port_num); + rdma_ah_set_make_grd(&attr, false); rdma_ah_set_dlid(&attr, cmd.attr.dlid); rdma_ah_set_sl(&attr, cmd.attr.sl); rdma_ah_set_path_bits(&attr, cmd.attr.src_path_bits); @@ -3019,7 +3013,6 @@ int ib_uverbs_ex_destroy_wq(struct ib_uverbs_file *file, { struct ib_uverbs_ex_destroy_wq cmd = {}; struct ib_uverbs_ex_destroy_wq_resp resp = {}; - struct ib_wq *wq; struct ib_uobject *uobj; struct ib_uwq_object *obj; size_t required_cmd_sz; @@ -3053,7 +3046,6 @@ int ib_uverbs_ex_destroy_wq(struct ib_uverbs_file *file, if (IS_ERR(uobj)) return PTR_ERR(uobj); - wq = uobj->object; obj = container_of(uobj, struct ib_uwq_object, uevent.uobject); /* * Make sure we don't free the memory in remove_commit as we still @@ -3489,6 +3481,9 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, if (IS_ERR(obj)) return PTR_ERR(obj); + if (cmd->srq_type == IB_SRQT_TM) + attr.ext.tag_matching.max_num_tags = cmd->max_num_tags; + if (cmd->srq_type == IB_SRQT_XRC) { xrcd_uobj = uobj_get_read(uobj_get_type(xrcd), cmd->xrcd_handle, file->ucontext); @@ -3505,10 +3500,12 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); atomic_inc(&obj->uxrcd->refcnt); + } - attr.ext.xrc.cq = uobj_get_obj_read(cq, cmd->cq_handle, - file->ucontext); - if (!attr.ext.xrc.cq) { + if (ib_srq_has_cq(cmd->srq_type)) { + attr.ext.cq = uobj_get_obj_read(cq, cmd->cq_handle, + file->ucontext); + if (!attr.ext.cq) { ret = -EINVAL; goto err_put_xrcd; } @@ -3543,10 +3540,13 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, srq->event_handler = attr.event_handler; srq->srq_context = attr.srq_context; + if (ib_srq_has_cq(cmd->srq_type)) { + srq->ext.cq = attr.ext.cq; + atomic_inc(&attr.ext.cq->usecnt); + } + if (cmd->srq_type == IB_SRQT_XRC) { - srq->ext.xrc.cq = attr.ext.xrc.cq; srq->ext.xrc.xrcd = attr.ext.xrc.xrcd; - atomic_inc(&attr.ext.xrc.cq->usecnt); atomic_inc(&attr.ext.xrc.xrcd->usecnt); } @@ -3569,10 +3569,12 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, goto err_copy; } - if (cmd->srq_type == IB_SRQT_XRC) { + if (cmd->srq_type == IB_SRQT_XRC) uobj_put_read(xrcd_uobj); - uobj_put_obj_read(attr.ext.xrc.cq); - } + + if (ib_srq_has_cq(cmd->srq_type)) + uobj_put_obj_read(attr.ext.cq); + uobj_put_obj_read(pd); uobj_alloc_commit(&obj->uevent.uobject); @@ -3585,8 +3587,8 @@ err_put: uobj_put_obj_read(pd); err_put_cq: - if (cmd->srq_type == IB_SRQT_XRC) - uobj_put_obj_read(attr.ext.xrc.cq); + if (ib_srq_has_cq(cmd->srq_type)) + uobj_put_obj_read(attr.ext.cq); err_put_xrcd: if (cmd->srq_type == IB_SRQT_XRC) { @@ -3616,6 +3618,7 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; + memset(&xcmd, 0, sizeof(xcmd)); xcmd.response = cmd.response; xcmd.user_handle = cmd.user_handle; 
xcmd.srq_type = IB_SRQT_BASIC; @@ -3624,10 +3627,10 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, xcmd.max_sge = cmd.max_sge; xcmd.srq_limit = cmd.srq_limit; - INIT_UDATA(&udata, buf + sizeof cmd, - (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd - sizeof(struct ib_uverbs_cmd_hdr), - out_len - sizeof resp); + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long) cmd.response + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); ret = __uverbs_create_xsrq(file, ib_dev, &xcmd, &udata); if (ret) @@ -3651,10 +3654,10 @@ ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - INIT_UDATA(&udata, buf + sizeof cmd, - (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd - sizeof(struct ib_uverbs_cmd_hdr), - out_len - sizeof resp); + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long) cmd.response + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); ret = __uverbs_create_xsrq(file, ib_dev, &cmd, &udata); if (ret) @@ -3743,10 +3746,8 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, struct ib_uverbs_destroy_srq cmd; struct ib_uverbs_destroy_srq_resp resp; struct ib_uobject *uobj; - struct ib_srq *srq; struct ib_uevent_object *obj; int ret = -EINVAL; - enum ib_srq_type srq_type; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; @@ -3756,9 +3757,7 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, if (IS_ERR(uobj)) return PTR_ERR(uobj); - srq = uobj->object; obj = container_of(uobj, struct ib_uevent_object, uobject); - srq_type = srq->srq_type; /* * Make sure we don't free the memory in remove_commit as we still * needs the uobject memory to create the response. @@ -3869,6 +3868,16 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, resp.raw_packet_caps = attr.raw_packet_caps; resp.response_length += sizeof(resp.raw_packet_caps); + + if (ucore->outlen < resp.response_length + sizeof(resp.tm_caps)) + goto end; + + resp.tm_caps.max_rndv_hdr_size = attr.tm_caps.max_rndv_hdr_size; + resp.tm_caps.max_num_tags = attr.tm_caps.max_num_tags; + resp.tm_caps.max_ops = attr.tm_caps.max_ops; + resp.tm_caps.max_sge = attr.tm_caps.max_sge; + resp.tm_caps.flags = attr.tm_caps.flags; + resp.response_length += sizeof(resp.tm_caps); end: err = ib_copy_to_udata(ucore, &resp, resp.response_length); return err; diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c new file mode 100644 index 000000000000..5286ad57d903 --- /dev/null +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <rdma/rdma_user_ioctl.h> +#include <rdma/uverbs_ioctl.h> +#include "rdma_core.h" +#include "uverbs.h" + +static int uverbs_process_attr(struct ib_device *ibdev, + struct ib_ucontext *ucontext, + const struct ib_uverbs_attr *uattr, + u16 attr_id, + const struct uverbs_attr_spec_hash *attr_spec_bucket, + struct uverbs_attr_bundle_hash *attr_bundle_h, + struct ib_uverbs_attr __user *uattr_ptr) +{ + const struct uverbs_attr_spec *spec; + struct uverbs_attr *e; + const struct uverbs_object_spec *object; + struct uverbs_obj_attr *o_attr; + struct uverbs_attr *elements = attr_bundle_h->attrs; + + if (uattr->reserved) + return -EINVAL; + + if (attr_id >= attr_spec_bucket->num_attrs) { + if (uattr->flags & UVERBS_ATTR_F_MANDATORY) + return -EINVAL; + else + return 0; + } + + spec = &attr_spec_bucket->attrs[attr_id]; + e = &elements[attr_id]; + e->uattr = uattr_ptr; + + switch (spec->type) { + case UVERBS_ATTR_TYPE_PTR_IN: + case UVERBS_ATTR_TYPE_PTR_OUT: + if (uattr->len < spec->len || + (!(spec->flags & UVERBS_ATTR_SPEC_F_MIN_SZ) && + uattr->len > spec->len)) + return -EINVAL; + + e->ptr_attr.data = uattr->data; + e->ptr_attr.len = uattr->len; + e->ptr_attr.flags = uattr->flags; + break; + + case UVERBS_ATTR_TYPE_IDR: + if (uattr->data >> 32) + return -EINVAL; + /* fall through */ + case UVERBS_ATTR_TYPE_FD: + if (uattr->len != 0 || !ucontext || uattr->data > INT_MAX) + return -EINVAL; + + o_attr = &e->obj_attr; + object = uverbs_get_object(ibdev, spec->obj.obj_type); + if (!object) + return -EINVAL; + o_attr->type = object->type_attrs; + + o_attr->id = (int)uattr->data; + o_attr->uobject = uverbs_get_uobject_from_context( + o_attr->type, + ucontext, + spec->obj.access, + o_attr->id); + + if (IS_ERR(o_attr->uobject)) + return PTR_ERR(o_attr->uobject); + + if (spec->obj.access == UVERBS_ACCESS_NEW) { + u64 id = o_attr->uobject->id; + + /* Copy the allocated id to the user-space */ + if (put_user(id, &e->uattr->data)) { + uverbs_finalize_object(o_attr->uobject, + UVERBS_ACCESS_NEW, + false); + return -EFAULT; + } + } + + break; + default: + return -EOPNOTSUPP; + } + + set_bit(attr_id, attr_bundle_h->valid_bitmap); + return 0; +} + +static int uverbs_uattrs_process(struct ib_device *ibdev, + struct ib_ucontext *ucontext, + const struct ib_uverbs_attr *uattrs, + size_t num_uattrs, + const struct uverbs_method_spec *method, + struct uverbs_attr_bundle *attr_bundle, + struct ib_uverbs_attr __user *uattr_ptr) +{ + size_t i; + int ret = 0; + int num_given_buckets = 0; + + for (i = 0; i < num_uattrs; i++) { + const struct ib_uverbs_attr *uattr = &uattrs[i]; + u16 attr_id = uattr->attr_id; + struct uverbs_attr_spec_hash *attr_spec_bucket; + + ret = uverbs_ns_idx(&attr_id, method->num_buckets); + if (ret < 0) { + if (uattr->flags & UVERBS_ATTR_F_MANDATORY) { + 
uverbs_finalize_objects(attr_bundle, + method->attr_buckets, + num_given_buckets, + false); + return ret; + } + continue; + } + + /* + * ret is the found ns, so increase num_given_buckets if + * necessary. + */ + if (ret >= num_given_buckets) + num_given_buckets = ret + 1; + + attr_spec_bucket = method->attr_buckets[ret]; + ret = uverbs_process_attr(ibdev, ucontext, uattr, attr_id, + attr_spec_bucket, &attr_bundle->hash[ret], + uattr_ptr++); + if (ret) { + uverbs_finalize_objects(attr_bundle, + method->attr_buckets, + num_given_buckets, + false); + return ret; + } + } + + return num_given_buckets; +} + +static int uverbs_validate_kernel_mandatory(const struct uverbs_method_spec *method_spec, + struct uverbs_attr_bundle *attr_bundle) +{ + unsigned int i; + + for (i = 0; i < attr_bundle->num_buckets; i++) { + struct uverbs_attr_spec_hash *attr_spec_bucket = + method_spec->attr_buckets[i]; + + if (!bitmap_subset(attr_spec_bucket->mandatory_attrs_bitmask, + attr_bundle->hash[i].valid_bitmap, + attr_spec_bucket->num_attrs)) + return -EINVAL; + } + + return 0; +} + +static int uverbs_handle_method(struct ib_uverbs_attr __user *uattr_ptr, + const struct ib_uverbs_attr *uattrs, + size_t num_uattrs, + struct ib_device *ibdev, + struct ib_uverbs_file *ufile, + const struct uverbs_method_spec *method_spec, + struct uverbs_attr_bundle *attr_bundle) +{ + int ret; + int finalize_ret; + int num_given_buckets; + + num_given_buckets = uverbs_uattrs_process(ibdev, ufile->ucontext, uattrs, + num_uattrs, method_spec, + attr_bundle, uattr_ptr); + if (num_given_buckets <= 0) + return -EINVAL; + + attr_bundle->num_buckets = num_given_buckets; + ret = uverbs_validate_kernel_mandatory(method_spec, attr_bundle); + if (ret) + goto cleanup; + + ret = method_spec->handler(ibdev, ufile, attr_bundle); +cleanup: + finalize_ret = uverbs_finalize_objects(attr_bundle, + method_spec->attr_buckets, + attr_bundle->num_buckets, + !ret); + + return ret ? 
ret : finalize_ret; +} + +#define UVERBS_OPTIMIZE_USING_STACK_SZ 256 +static long ib_uverbs_cmd_verbs(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct ib_uverbs_ioctl_hdr *hdr, + void __user *buf) +{ + const struct uverbs_object_spec *object_spec; + const struct uverbs_method_spec *method_spec; + long err = 0; + unsigned int i; + struct { + struct ib_uverbs_attr *uattrs; + struct uverbs_attr_bundle *uverbs_attr_bundle; + } *ctx = NULL; + struct uverbs_attr *curr_attr; + unsigned long *curr_bitmap; + size_t ctx_size; +#ifdef UVERBS_OPTIMIZE_USING_STACK_SZ + uintptr_t data[UVERBS_OPTIMIZE_USING_STACK_SZ / sizeof(uintptr_t)]; +#endif + + if (hdr->reserved) + return -EINVAL; + + object_spec = uverbs_get_object(ib_dev, hdr->object_id); + if (!object_spec) + return -EOPNOTSUPP; + + method_spec = uverbs_get_method(object_spec, hdr->method_id); + if (!method_spec) + return -EOPNOTSUPP; + + if ((method_spec->flags & UVERBS_ACTION_FLAG_CREATE_ROOT) ^ !file->ucontext) + return -EINVAL; + + ctx_size = sizeof(*ctx) + + sizeof(struct uverbs_attr_bundle) + + sizeof(struct uverbs_attr_bundle_hash) * method_spec->num_buckets + + sizeof(*ctx->uattrs) * hdr->num_attrs + + sizeof(*ctx->uverbs_attr_bundle->hash[0].attrs) * + method_spec->num_child_attrs + + sizeof(*ctx->uverbs_attr_bundle->hash[0].valid_bitmap) * + (method_spec->num_child_attrs / BITS_PER_LONG + + method_spec->num_buckets); + +#ifdef UVERBS_OPTIMIZE_USING_STACK_SZ + if (ctx_size <= UVERBS_OPTIMIZE_USING_STACK_SZ) + ctx = (void *)data; + + if (!ctx) +#endif + ctx = kmalloc(ctx_size, GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->uverbs_attr_bundle = (void *)ctx + sizeof(*ctx); + ctx->uattrs = (void *)(ctx->uverbs_attr_bundle + 1) + + (sizeof(ctx->uverbs_attr_bundle->hash[0]) * + method_spec->num_buckets); + curr_attr = (void *)(ctx->uattrs + hdr->num_attrs); + curr_bitmap = (void *)(curr_attr + method_spec->num_child_attrs); + + /* + * We just fill the pointers and num_attrs here. 
The data itself will be + * filled at a later stage (uverbs_process_attr) + */ + for (i = 0; i < method_spec->num_buckets; i++) { + unsigned int curr_num_attrs = method_spec->attr_buckets[i]->num_attrs; + + ctx->uverbs_attr_bundle->hash[i].attrs = curr_attr; + curr_attr += curr_num_attrs; + ctx->uverbs_attr_bundle->hash[i].num_attrs = curr_num_attrs; + ctx->uverbs_attr_bundle->hash[i].valid_bitmap = curr_bitmap; + bitmap_zero(curr_bitmap, curr_num_attrs); + curr_bitmap += BITS_TO_LONGS(curr_num_attrs); + } + + err = copy_from_user(ctx->uattrs, buf, + sizeof(*ctx->uattrs) * hdr->num_attrs); + if (err) { + err = -EFAULT; + goto out; + } + + err = uverbs_handle_method(buf, ctx->uattrs, hdr->num_attrs, ib_dev, + file, method_spec, ctx->uverbs_attr_bundle); +out: +#ifdef UVERBS_OPTIMIZE_USING_STACK_SZ + if (ctx_size > UVERBS_OPTIMIZE_USING_STACK_SZ) +#endif + kfree(ctx); + return err; +} + +#define IB_UVERBS_MAX_CMD_SZ 4096 + +long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct ib_uverbs_file *file = filp->private_data; + struct ib_uverbs_ioctl_hdr __user *user_hdr = + (struct ib_uverbs_ioctl_hdr __user *)arg; + struct ib_uverbs_ioctl_hdr hdr; + struct ib_device *ib_dev; + int srcu_key; + long err; + + srcu_key = srcu_read_lock(&file->device->disassociate_srcu); + ib_dev = srcu_dereference(file->device->ib_dev, + &file->device->disassociate_srcu); + if (!ib_dev) { + err = -EIO; + goto out; + } + + if (cmd == RDMA_VERBS_IOCTL) { + err = copy_from_user(&hdr, user_hdr, sizeof(hdr)); + + if (err || hdr.length > IB_UVERBS_MAX_CMD_SZ || + hdr.length != sizeof(hdr) + hdr.num_attrs * sizeof(struct ib_uverbs_attr)) { + err = -EINVAL; + goto out; + } + + if (hdr.reserved) { + err = -EOPNOTSUPP; + goto out; + } + + err = ib_uverbs_cmd_verbs(ib_dev, file, &hdr, + (__user void *)arg + sizeof(hdr)); + } else { + err = -ENOIOCTLCMD; + } +out: + srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); + + return err; +} diff --git a/drivers/infiniband/core/uverbs_ioctl_merge.c b/drivers/infiniband/core/uverbs_ioctl_merge.c new file mode 100644 index 000000000000..76ddb6564578 --- /dev/null +++ b/drivers/infiniband/core/uverbs_ioctl_merge.c @@ -0,0 +1,665 @@ +/* + * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +
+#include <rdma/uverbs_ioctl.h> +#include <rdma/rdma_user_ioctl.h> +#include <linux/bitops.h> +#include "uverbs.h" +
+#define UVERBS_NUM_NS (UVERBS_ID_NS_MASK >> UVERBS_ID_NS_SHIFT) +#define GET_NS_ID(idx) (((idx) & UVERBS_ID_NS_MASK) >> UVERBS_ID_NS_SHIFT) +#define GET_ID(idx) ((idx) & ~UVERBS_ID_NS_MASK) +
+#define _for_each_element(elem, tmpi, tmpj, hashes, num_buckets_offset, \ + buckets_offset) \ + for (tmpj = 0, \ + elem = (*(const void ***)((hashes)[tmpi] + \ + (buckets_offset)))[0]; \ + tmpj < *(size_t *)((hashes)[tmpi] + (num_buckets_offset)); \ + tmpj++) \ + if ((elem = ((*(const void ***)(hashes[tmpi] + \ + (buckets_offset)))[tmpj]))) +
+/* + * Iterate over all elements of a few @hashes. The number of given hashes is + * indicated by @num_hashes. The offset of the number of buckets in the hash is + * represented by @num_buckets_offset, while the offset of the buckets array in + * the hash structure is represented by @buckets_offset. tmpi and tmpj are two + * short (or int) based indices that are given by the user. tmpi iterates over + * the different hashes. @elem points to the current element in the hashes[tmpi] + * bucket we are looping on. Strictly speaking, the @hashes representation isn't + * exactly a hash, but rather a collection of elements. These elements' ids are + * treated in a hash-like manner, where the upper bits are the bucket number. + * These elements are later mapped into a perfect-hash. + */ +#define for_each_element(elem, tmpi, tmpj, hashes, num_hashes, \ + num_buckets_offset, buckets_offset) \ + for (tmpi = 0; tmpi < (num_hashes); tmpi++) \ + _for_each_element(elem, tmpi, tmpj, hashes, num_buckets_offset,\ + buckets_offset) +
+#define get_elements_iterators_entry_above(iters, num_elements, elements, \ + num_objects_fld, objects_fld, bucket,\ + min_id) \ + get_elements_above_id((const void **)iters, num_elements, \ + (const void **)(elements), \ + offsetof(typeof(**elements), \ + num_objects_fld), \ + offsetof(typeof(**elements), objects_fld),\ + offsetof(typeof(***(*elements)->objects_fld), id),\ + bucket, min_id) +
+#define get_objects_above_id(iters, num_trees, trees, bucket, min_id) \ + get_elements_iterators_entry_above(iters, num_trees, trees, \ + num_objects, objects, bucket, min_id) +
+#define get_methods_above_id(method_iters, num_iters, iters, bucket, min_id)\ + get_elements_iterators_entry_above(method_iters, num_iters, iters, \ + num_methods, methods, bucket, min_id) +
+#define get_attrs_above_id(attrs_iters, num_iters, iters, bucket, min_id)\ + get_elements_iterators_entry_above(attrs_iters, num_iters, iters, \ + num_attrs, attrs, bucket, min_id) +
+/* + * get_elements_above_id gets a few hashes represented by @elements and + * @num_elements. The hashes' fields are described by @num_offset, @data_offset + * and @id_offset in the same way as required by for_each_element. The function + * fills the @iters array with elements from the hashes' buckets whose ids are + * the smallest ids in all the hashes that are still larger than the id given + * by min_id. Elements are only added to the iters + * array if their id belongs to the bucket @bucket. The number of elements in + * the returned array is returned by the function.
@min_id is also updated to + * reflect the new min_id of all elements in iters. + */ +static size_t get_elements_above_id(const void **iters, + unsigned int num_elements, + const void **elements, + size_t num_offset, + size_t data_offset, + size_t id_offset, + u16 bucket, + short *min_id) +{ + size_t num_iters = 0; + short min = SHRT_MAX; + const void *elem; + int i, j, last_stored = -1; + + for_each_element(elem, i, j, elements, num_elements, num_offset, + data_offset) { + u16 id = *(u16 *)(elem + id_offset); + + if (GET_NS_ID(id) != bucket) + continue; + + if (GET_ID(id) < *min_id || + (min != SHRT_MAX && GET_ID(id) > min)) + continue; + + /* + * We first iterate all hashes represented by @elements. When + * we do, we try to find an element @elem in the bucket @bucket + * which its id is min. Since we can't ensure the user sorted + * the elements in increasing order, we override this hash's + * minimal id element we found, if a new element with a smaller + * id was just found. + */ + iters[last_stored == i ? num_iters - 1 : num_iters++] = elem; + last_stored = i; + min = GET_ID(id); + } + + /* + * We only insert to our iters array an element, if its id is smaller + * than all previous ids. Therefore, the final iters array is sorted so + * that smaller ids are in the end of the array. + * Therefore, we need to clean the beginning of the array to make sure + * all ids of final elements are equal to min. + */ + for (i = num_iters - 1; i >= 0 && + GET_ID(*(u16 *)(iters[i] + id_offset)) == min; i--) + ; + + num_iters -= i + 1; + memmove(iters, iters + i + 1, sizeof(*iters) * num_iters); + + *min_id = min; + return num_iters; +} + +#define find_max_element_entry_id(num_elements, elements, num_objects_fld, \ + objects_fld, bucket) \ + find_max_element_id(num_elements, (const void **)(elements), \ + offsetof(typeof(**elements), num_objects_fld), \ + offsetof(typeof(**elements), objects_fld), \ + offsetof(typeof(***(*elements)->objects_fld), id),\ + bucket) + +static short find_max_element_ns_id(unsigned int num_elements, + const void **elements, + size_t num_offset, + size_t data_offset, + size_t id_offset) +{ + short max_ns = SHRT_MIN; + const void *elem; + int i, j; + + for_each_element(elem, i, j, elements, num_elements, num_offset, + data_offset) { + u16 id = *(u16 *)(elem + id_offset); + + if (GET_NS_ID(id) > max_ns) + max_ns = GET_NS_ID(id); + } + + return max_ns; +} + +static short find_max_element_id(unsigned int num_elements, + const void **elements, + size_t num_offset, + size_t data_offset, + size_t id_offset, + u16 bucket) +{ + short max_id = SHRT_MIN; + const void *elem; + int i, j; + + for_each_element(elem, i, j, elements, num_elements, num_offset, + data_offset) { + u16 id = *(u16 *)(elem + id_offset); + + if (GET_NS_ID(id) == bucket && + GET_ID(id) > max_id) + max_id = GET_ID(id); + } + return max_id; +} + +#define find_max_element_entry_id(num_elements, elements, num_objects_fld, \ + objects_fld, bucket) \ + find_max_element_id(num_elements, (const void **)(elements), \ + offsetof(typeof(**elements), num_objects_fld), \ + offsetof(typeof(**elements), objects_fld), \ + offsetof(typeof(***(*elements)->objects_fld), id),\ + bucket) + +#define find_max_element_ns_entry_id(num_elements, elements, \ + num_objects_fld, objects_fld) \ + find_max_element_ns_id(num_elements, (const void **)(elements), \ + offsetof(typeof(**elements), num_objects_fld),\ + offsetof(typeof(**elements), objects_fld), \ + offsetof(typeof(***(*elements)->objects_fld), id)) + +/* + * find_max_xxxx_ns_id gets a 
few elements. Each element is described by an id + * whose upper bits represent a namespace. It finds the max namespace. This + * could be used in order to know how many buckets we need to allocate. If no + * elements exist, SHRT_MIN is returned. Here, each namespace corresponds to a + * different bucket. The common example is "common bucket" and "driver bucket". + * + * find_max_xxxx_id gets a few elements and a bucket. Each element is described + * by an id whose upper bits represent a namespace. It returns the max id + * which is contained in the same namespace defined in @bucket. This could be + * used in order to know how many elements we need to allocate in the bucket. + * If no elements exist, SHRT_MIN is returned. + */ +
+#define find_max_object_id(num_trees, trees, bucket) \ + find_max_element_entry_id(num_trees, trees, num_objects,\ + objects, bucket) +#define find_max_object_ns_id(num_trees, trees) \ + find_max_element_ns_entry_id(num_trees, trees, \ + num_objects, objects) +
+#define find_max_method_id(num_iters, iters, bucket) \ + find_max_element_entry_id(num_iters, iters, num_methods,\ + methods, bucket) +#define find_max_method_ns_id(num_iters, iters) \ + find_max_element_ns_entry_id(num_iters, iters, \ + num_methods, methods) +
+#define find_max_attr_id(num_iters, iters, bucket) \ + find_max_element_entry_id(num_iters, iters, num_attrs, \ + attrs, bucket) +#define find_max_attr_ns_id(num_iters, iters) \ + find_max_element_ns_entry_id(num_iters, iters, \ + num_attrs, attrs) +
+static void free_method(struct uverbs_method_spec *method) +{ + unsigned int i; + + if (!method) + return; + + for (i = 0; i < method->num_buckets; i++) + kfree(method->attr_buckets[i]); + + kfree(method); +} +
+#define IS_ATTR_OBJECT(attr) ((attr)->type == UVERBS_ATTR_TYPE_IDR || \ + (attr)->type == UVERBS_ATTR_TYPE_FD) +
+/* + * This function gets an array of size @num_method_defs which contains pointers + * to method definitions @method_defs. The function allocates an + * uverbs_method_spec structure and initializes its number of buckets and the + * elements in buckets to the correct attributes. While doing that, it + * validates that there aren't conflicts between attributes of different + * method_defs.
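+ * + * For example: merging a common tree and a driver tree that both describe the + * same method yields a single uverbs_method_spec whose attr_buckets[] holds one + * bucket for the common-namespace attributes and one for the driver-namespace + * attributes, with num_child_attrs counting the attributes of both buckets.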
+ */ +static struct uverbs_method_spec *build_method_with_attrs(const struct uverbs_method_def **method_defs, + size_t num_method_defs) +{ + int bucket_idx; + int max_attr_buckets = 0; + size_t num_attr_buckets = 0; + int res = 0; + struct uverbs_method_spec *method = NULL; + const struct uverbs_attr_def **attr_defs; + unsigned int num_of_singularities = 0; + + max_attr_buckets = find_max_attr_ns_id(num_method_defs, method_defs); + if (max_attr_buckets >= 0) + num_attr_buckets = max_attr_buckets + 1; + + method = kzalloc(sizeof(*method) + + num_attr_buckets * sizeof(*method->attr_buckets), + GFP_KERNEL); + if (!method) + return ERR_PTR(-ENOMEM); + + method->num_buckets = num_attr_buckets; + attr_defs = kcalloc(num_method_defs, sizeof(*attr_defs), GFP_KERNEL); + if (!attr_defs) { + res = -ENOMEM; + goto free_method; + } + for (bucket_idx = 0; bucket_idx < method->num_buckets; bucket_idx++) { + short min_id = SHRT_MIN; + int attr_max_bucket = 0; + struct uverbs_attr_spec_hash *hash = NULL; + + attr_max_bucket = find_max_attr_id(num_method_defs, method_defs, + bucket_idx); + if (attr_max_bucket < 0) + continue; + + hash = kzalloc(sizeof(*hash) + + ALIGN(sizeof(*hash->attrs) * (attr_max_bucket + 1), + sizeof(long)) + + BITS_TO_LONGS(attr_max_bucket) * sizeof(long), + GFP_KERNEL); + if (!hash) { + res = -ENOMEM; + goto free; + } + hash->num_attrs = attr_max_bucket + 1; + method->num_child_attrs += hash->num_attrs; + hash->mandatory_attrs_bitmask = (void *)(hash + 1) + + ALIGN(sizeof(*hash->attrs) * + (attr_max_bucket + 1), + sizeof(long)); + + method->attr_buckets[bucket_idx] = hash; + + do { + size_t num_attr_defs; + struct uverbs_attr_spec *attr; + bool attr_obj_with_special_access; + + num_attr_defs = + get_attrs_above_id(attr_defs, + num_method_defs, + method_defs, + bucket_idx, + &min_id); + /* Last attr in bucket */ + if (!num_attr_defs) + break; + + if (num_attr_defs > 1) { + /* + * We don't allow two attribute definitions for + * the same attribute. This is usually a + * programmer error. If required, it's better to + * just add a new attribute to capture the new + * semantics. 
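+ * + * For instance: if two merged trees both define an attribute with the same id + * for this method, the merge below fails with -EEXIST instead of silently + * preferring one of the definitions.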
+ */ + res = -EEXIST; + goto free; + } + + attr = &hash->attrs[min_id]; + memcpy(attr, &attr_defs[0]->attr, sizeof(*attr)); +
+ attr_obj_with_special_access = IS_ATTR_OBJECT(attr) && + (attr->obj.access == UVERBS_ACCESS_NEW || + attr->obj.access == UVERBS_ACCESS_DESTROY); + num_of_singularities += !!attr_obj_with_special_access; + if (WARN(num_of_singularities > 1, + "ib_uverbs: Method contains more than one object attr (%d) with new/destroy access\n", + min_id) || + WARN(attr_obj_with_special_access && + !(attr->flags & UVERBS_ATTR_SPEC_F_MANDATORY), + "ib_uverbs: Tried to merge attr (%d) but it's an object with new/destroy access but isn't mandatory\n", + min_id) || + WARN(IS_ATTR_OBJECT(attr) && + attr->flags & UVERBS_ATTR_SPEC_F_MIN_SZ, + "ib_uverbs: Tried to merge attr (%d) but it's an object with min_sz flag\n", + min_id)) { + res = -EINVAL; + goto free; + } +
+ if (attr->flags & UVERBS_ATTR_SPEC_F_MANDATORY) + set_bit(min_id, hash->mandatory_attrs_bitmask); + min_id++; + + } while (1); + } + kfree(attr_defs); + return method; +
+free: + kfree(attr_defs); +free_method: + free_method(method); + return ERR_PTR(res); +} +
+static void free_object(struct uverbs_object_spec *object) +{ + unsigned int i, j; + + if (!object) + return; + + for (i = 0; i < object->num_buckets; i++) { + struct uverbs_method_spec_hash *method_buckets = + object->method_buckets[i]; + + if (!method_buckets) + continue; + + for (j = 0; j < method_buckets->num_methods; j++) + free_method(method_buckets->methods[j]); + + kfree(method_buckets); + } + + kfree(object); +} +
+/* + * This function gets an array of size @num_object_defs which contains pointers + * to object definitions @object_defs. The function allocates an + * uverbs_object_spec structure and initializes its number of buckets and the + * elements in buckets to the correct methods. While doing that, it + * resolves conflicts between different definitions of the same method.
+ */ +static struct uverbs_object_spec *build_object_with_methods(const struct uverbs_object_def **object_defs, + size_t num_object_defs) +{ + u16 bucket_idx; + int max_method_buckets = 0; + u16 num_method_buckets = 0; + int res = 0; + struct uverbs_object_spec *object = NULL; + const struct uverbs_method_def **method_defs; + + max_method_buckets = find_max_method_ns_id(num_object_defs, object_defs); + if (max_method_buckets >= 0) + num_method_buckets = max_method_buckets + 1; + + object = kzalloc(sizeof(*object) + + num_method_buckets * + sizeof(*object->method_buckets), GFP_KERNEL); + if (!object) + return ERR_PTR(-ENOMEM); + + object->num_buckets = num_method_buckets; + method_defs = kcalloc(num_object_defs, sizeof(*method_defs), GFP_KERNEL); + if (!method_defs) { + res = -ENOMEM; + goto free_object; + } + + for (bucket_idx = 0; bucket_idx < object->num_buckets; bucket_idx++) { + short min_id = SHRT_MIN; + int methods_max_bucket = 0; + struct uverbs_method_spec_hash *hash = NULL; + + methods_max_bucket = find_max_method_id(num_object_defs, object_defs, + bucket_idx); + if (methods_max_bucket < 0) + continue; + + hash = kzalloc(sizeof(*hash) + + sizeof(*hash->methods) * (methods_max_bucket + 1), + GFP_KERNEL); + if (!hash) { + res = -ENOMEM; + goto free; + } + + hash->num_methods = methods_max_bucket + 1; + object->method_buckets[bucket_idx] = hash; + + do { + size_t num_method_defs; + struct uverbs_method_spec *method; + int i; + + num_method_defs = + get_methods_above_id(method_defs, + num_object_defs, + object_defs, + bucket_idx, + &min_id); + /* Last method in bucket */ + if (!num_method_defs) + break; + + method = build_method_with_attrs(method_defs, + num_method_defs); + if (IS_ERR(method)) { + res = PTR_ERR(method); + goto free; + } + + /* + * The last tree which is given as an argument to the + * merge overrides previous method handler. + * Therefore, we iterate backwards and search for the + * first handler which != NULL. This also defines the + * set of flags used for this handler. + */ + for (i = num_object_defs - 1; + i >= 0 && !method_defs[i]->handler; i--) + ; + hash->methods[min_id++] = method; + /* NULL handler isn't allowed */ + if (WARN(i < 0, + "ib_uverbs: tried to merge function id %d, but all handlers are NULL\n", + min_id)) { + res = -EINVAL; + goto free; + } + method->handler = method_defs[i]->handler; + method->flags = method_defs[i]->flags; + + } while (1); + } + kfree(method_defs); + return object; + +free: + kfree(method_defs); +free_object: + free_object(object); + return ERR_PTR(res); +} + +void uverbs_free_spec_tree(struct uverbs_root_spec *root) +{ + unsigned int i, j; + + if (!root) + return; + + for (i = 0; i < root->num_buckets; i++) { + struct uverbs_object_spec_hash *object_hash = + root->object_buckets[i]; + + if (!object_hash) + continue; + + for (j = 0; j < object_hash->num_objects; j++) + free_object(object_hash->objects[j]); + + kfree(object_hash); + } + + kfree(root); +} +EXPORT_SYMBOL(uverbs_free_spec_tree); + +struct uverbs_root_spec *uverbs_alloc_spec_tree(unsigned int num_trees, + const struct uverbs_object_tree_def **trees) +{ + u16 bucket_idx; + short max_object_buckets = 0; + size_t num_objects_buckets = 0; + struct uverbs_root_spec *root_spec = NULL; + const struct uverbs_object_def **object_defs; + int i; + int res = 0; + + max_object_buckets = find_max_object_ns_id(num_trees, trees); + /* + * Devices which don't want to support ib_uverbs, should just allocate + * an empty parsing tree. 
Every user-space command won't hit any valid + * entry in the parsing tree and thus will fail. + */ + if (max_object_buckets >= 0) + num_objects_buckets = max_object_buckets + 1; + + root_spec = kzalloc(sizeof(*root_spec) + + num_objects_buckets * sizeof(*root_spec->object_buckets), + GFP_KERNEL); + if (!root_spec) + return ERR_PTR(-ENOMEM); + root_spec->num_buckets = num_objects_buckets; + + object_defs = kcalloc(num_trees, sizeof(*object_defs), + GFP_KERNEL); + if (!object_defs) { + res = -ENOMEM; + goto free_root; + } + + for (bucket_idx = 0; bucket_idx < root_spec->num_buckets; bucket_idx++) { + short min_id = SHRT_MIN; + short objects_max_bucket; + struct uverbs_object_spec_hash *hash = NULL; + + objects_max_bucket = find_max_object_id(num_trees, trees, + bucket_idx); + if (objects_max_bucket < 0) + continue; + + hash = kzalloc(sizeof(*hash) + + sizeof(*hash->objects) * (objects_max_bucket + 1), + GFP_KERNEL); + if (!hash) { + res = -ENOMEM; + goto free; + } + hash->num_objects = objects_max_bucket + 1; + root_spec->object_buckets[bucket_idx] = hash; + + do { + size_t num_object_defs; + struct uverbs_object_spec *object; + + num_object_defs = get_objects_above_id(object_defs, + num_trees, + trees, + bucket_idx, + &min_id); + /* Last object in bucket */ + if (!num_object_defs) + break; + + object = build_object_with_methods(object_defs, + num_object_defs); + if (IS_ERR(object)) { + res = PTR_ERR(object); + goto free; + } + + /* + * The last tree which is given as an argument to the + * merge overrides previous object's type_attrs. + * Therefore, we iterate backwards and search for the + * first type_attrs which != NULL. + */ + for (i = num_object_defs - 1; + i >= 0 && !object_defs[i]->type_attrs; i--) + ; + /* + * NULL is a valid type_attrs. It means an object we + * can't instantiate (like DEVICE). + */ + object->type_attrs = i < 0 ? 
NULL : + object_defs[i]->type_attrs; + + hash->objects[min_id++] = object; + } while (1); + } + + kfree(object_defs); + return root_spec; + +free: + kfree(object_defs); +free_root: + uverbs_free_spec_tree(root_spec); + return ERR_PTR(res); +} +EXPORT_SYMBOL(uverbs_alloc_spec_tree); diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 3d2609608f58..dc2aed6fb21b 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -49,6 +49,7 @@ #include <linux/uaccess.h> #include <rdma/ib.h> +#include <rdma/uverbs_std_types.h> #include "uverbs.h" #include "core_priv.h" @@ -250,6 +251,7 @@ void ib_uverbs_release_file(struct kref *ref) if (atomic_dec_and_test(&file->device->refcount)) ib_uverbs_comp_dev(file->device); + kobject_put(&file->device->kobj); kfree(file); } @@ -594,7 +596,6 @@ struct file *ib_uverbs_alloc_async_event_file(struct ib_uverbs_file *uverbs_file { struct ib_uverbs_async_event_file *ev_file; struct file *filp; - int ret; ev_file = kzalloc(sizeof(*ev_file), GFP_KERNEL); if (!ev_file) @@ -620,21 +621,11 @@ struct file *ib_uverbs_alloc_async_event_file(struct ib_uverbs_file *uverbs_file INIT_IB_EVENT_HANDLER(&uverbs_file->event_handler, ib_dev, ib_uverbs_event_handler); - ret = ib_register_event_handler(&uverbs_file->event_handler); - if (ret) - goto err_put_file; - + ib_register_event_handler(&uverbs_file->event_handler); /* At that point async file stuff was fully set */ return filp; -err_put_file: - fput(filp); - kref_put(&uverbs_file->async_file->ref, - ib_uverbs_release_async_event_file); - uverbs_file->async_file = NULL; - return ERR_PTR(ret); - err_put_refs: kref_put(&ev_file->uverbs_file->ref, ib_uverbs_release_file); kref_put(&ev_file->ref, ib_uverbs_release_async_event_file); @@ -917,7 +908,6 @@ err: static int ib_uverbs_close(struct inode *inode, struct file *filp) { struct ib_uverbs_file *file = filp->private_data; - struct ib_uverbs_device *dev = file->device; mutex_lock(&file->cleanup_mutex); if (file->ucontext) { @@ -939,7 +929,6 @@ static int ib_uverbs_close(struct inode *inode, struct file *filp) ib_uverbs_release_async_event_file); kref_put(&file->ref, ib_uverbs_release_file); - kobject_put(&dev->kobj); return 0; } @@ -950,6 +939,9 @@ static const struct file_operations uverbs_fops = { .open = ib_uverbs_open, .release = ib_uverbs_close, .llseek = no_llseek, +#if IS_ENABLED(CONFIG_INFINIBAND_EXP_USER_ACCESS) + .unlocked_ioctl = ib_uverbs_ioctl, +#endif }; static const struct file_operations uverbs_mmap_fops = { @@ -959,6 +951,9 @@ static const struct file_operations uverbs_mmap_fops = { .open = ib_uverbs_open, .release = ib_uverbs_close, .llseek = no_llseek, +#if IS_ENABLED(CONFIG_INFINIBAND_EXP_USER_ACCESS) + .unlocked_ioctl = ib_uverbs_ioctl, +#endif }; static struct ib_client uverbs_client = { @@ -1109,6 +1104,18 @@ static void ib_uverbs_add_one(struct ib_device *device) if (device_create_file(uverbs_dev->dev, &dev_attr_abi_version)) goto err_class; + if (!device->specs_root) { + const struct uverbs_object_tree_def *default_root[] = { + uverbs_default_get_objects()}; + + uverbs_dev->specs_root = uverbs_alloc_spec_tree(1, + default_root); + if (IS_ERR(uverbs_dev->specs_root)) + goto err_class; + + device->specs_root = uverbs_dev->specs_root; + } + ib_set_client_data(device, &uverbs_client, uverbs_dev); return; @@ -1154,7 +1161,6 @@ static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev, kref_get(&file->ref); mutex_unlock(&uverbs_dev->lists_mutex); - 
ib_uverbs_event_handler(&file->event_handler, &event); mutex_lock(&file->cleanup_mutex); ucontext = file->ucontext; @@ -1171,6 +1177,7 @@ static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev, * for example due to freeing the resources * (e.g mmput). */ + ib_uverbs_event_handler(&file->event_handler, &event); ib_dev->disassociate_ucontext(ucontext); mutex_lock(&file->cleanup_mutex); ib_uverbs_cleanup_ucontext(file, ucontext, true); @@ -1240,6 +1247,11 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data) ib_uverbs_comp_dev(uverbs_dev); if (wait_clients) wait_for_completion(&uverbs_dev->comp); + if (uverbs_dev->specs_root) { + uverbs_free_spec_tree(uverbs_dev->specs_root); + device->specs_root = NULL; + } + kobject_put(&uverbs_dev->kobj); } diff --git a/drivers/infiniband/core/uverbs_marshall.c b/drivers/infiniband/core/uverbs_marshall.c index 94fd989c9060..bd0acf376af0 100644 --- a/drivers/infiniband/core/uverbs_marshall.c +++ b/drivers/infiniband/core/uverbs_marshall.c @@ -33,10 +33,47 @@ #include <linux/export.h> #include <rdma/ib_marshall.h> -void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst, - struct rdma_ah_attr *src) +#define OPA_DEFAULT_GID_PREFIX cpu_to_be64(0xfe80000000000000ULL) +static int rdma_ah_conv_opa_to_ib(struct ib_device *dev, + struct rdma_ah_attr *ib, + struct rdma_ah_attr *opa) { + struct ib_port_attr port_attr; + int ret = 0; + + /* Do structure copy and the over-write fields */ + *ib = *opa; + + ib->type = RDMA_AH_ATTR_TYPE_IB; + rdma_ah_set_grh(ib, NULL, 0, 0, 1, 0); + + if (ib_query_port(dev, opa->port_num, &port_attr)) { + /* Set to default subnet to indicate error */ + rdma_ah_set_subnet_prefix(ib, OPA_DEFAULT_GID_PREFIX); + ret = -EINVAL; + } else { + rdma_ah_set_subnet_prefix(ib, + cpu_to_be64(port_attr.subnet_prefix)); + } + rdma_ah_set_interface_id(ib, OPA_MAKE_ID(rdma_ah_get_dlid(opa))); + return ret; +} + +void ib_copy_ah_attr_to_user(struct ib_device *device, + struct ib_uverbs_ah_attr *dst, + struct rdma_ah_attr *ah_attr) +{ + struct rdma_ah_attr *src = ah_attr; + struct rdma_ah_attr conv_ah; + memset(&dst->grh.reserved, 0, sizeof(dst->grh.reserved)); + + if ((ah_attr->type == RDMA_AH_ATTR_TYPE_OPA) && + (rdma_ah_get_dlid(ah_attr) >= + be16_to_cpu(IB_MULTICAST_LID_BASE)) && + (!rdma_ah_conv_opa_to_ib(device, &conv_ah, ah_attr))) + src = &conv_ah; + dst->dlid = rdma_ah_get_dlid(src); dst->sl = rdma_ah_get_sl(src); dst->src_path_bits = rdma_ah_get_path_bits(src); @@ -57,7 +94,8 @@ void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst, } EXPORT_SYMBOL(ib_copy_ah_attr_to_user); -void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst, +void ib_copy_qp_attr_to_user(struct ib_device *device, + struct ib_uverbs_qp_attr *dst, struct ib_qp_attr *src) { dst->qp_state = src->qp_state; @@ -76,8 +114,8 @@ void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst, dst->max_recv_sge = src->cap.max_recv_sge; dst->max_inline_data = src->cap.max_inline_data; - ib_copy_ah_attr_to_user(&dst->ah_attr, &src->ah_attr); - ib_copy_ah_attr_to_user(&dst->alt_ah_attr, &src->alt_ah_attr); + ib_copy_ah_attr_to_user(device, &dst->ah_attr, &src->ah_attr); + ib_copy_ah_attr_to_user(device, &dst->alt_ah_attr, &src->alt_ah_attr); dst->pkey_index = src->pkey_index; dst->alt_pkey_index = src->alt_pkey_index; diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index ef293379f37a..0a98579700ec 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ 
b/drivers/infiniband/core/uverbs_std_types.c @@ -209,67 +209,244 @@ static int uverbs_hot_unplug_completion_event_file(struct ib_uobject_file *uobj_ return 0; }; -const struct uverbs_obj_fd_type uverbs_type_attrs_comp_channel = { - .type = UVERBS_TYPE_ALLOC_FD(sizeof(struct ib_uverbs_completion_event_file), 0), - .context_closed = uverbs_hot_unplug_completion_event_file, - .fops = &uverbs_event_fops, - .name = "[infinibandevent]", - .flags = O_RDONLY, -}; +/* + * This spec is used in order to pass information to the hardware driver in a + * legacy way. Every verb that could get driver specific data should get this + * spec. + */ +static const struct uverbs_attr_def uverbs_uhw_compat_in = + UVERBS_ATTR_PTR_IN_SZ(UVERBS_UHW_IN, 0, UA_FLAGS(UVERBS_ATTR_SPEC_F_MIN_SZ)); +static const struct uverbs_attr_def uverbs_uhw_compat_out = + UVERBS_ATTR_PTR_OUT_SZ(UVERBS_UHW_OUT, 0, UA_FLAGS(UVERBS_ATTR_SPEC_F_MIN_SZ)); -const struct uverbs_obj_idr_type uverbs_type_attrs_cq = { - .type = UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_ucq_object), 0), - .destroy_object = uverbs_free_cq, -}; +static void create_udata(struct uverbs_attr_bundle *ctx, + struct ib_udata *udata) +{ + /* + * This is for ease of conversion. The purpose is to convert all drivers + * to use uverbs_attr_bundle instead of ib_udata. + * Assume attr == 0 is input and attr == 1 is output. + */ + void __user *inbuf; + size_t inbuf_len = 0; + void __user *outbuf; + size_t outbuf_len = 0; + const struct uverbs_attr *uhw_in = + uverbs_attr_get(ctx, UVERBS_UHW_IN); + const struct uverbs_attr *uhw_out = + uverbs_attr_get(ctx, UVERBS_UHW_OUT); + + if (!IS_ERR(uhw_in)) { + inbuf = uhw_in->ptr_attr.ptr; + inbuf_len = uhw_in->ptr_attr.len; + } -const struct uverbs_obj_idr_type uverbs_type_attrs_qp = { - .type = UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object), 0), - .destroy_object = uverbs_free_qp, -}; + if (!IS_ERR(uhw_out)) { + outbuf = uhw_out->ptr_attr.ptr; + outbuf_len = uhw_out->ptr_attr.len; + } -const struct uverbs_obj_idr_type uverbs_type_attrs_mw = { - .type = UVERBS_TYPE_ALLOC_IDR(0), - .destroy_object = uverbs_free_mw, -}; + INIT_UDATA_BUF_OR_NULL(udata, inbuf, outbuf, inbuf_len, outbuf_len); +} -const struct uverbs_obj_idr_type uverbs_type_attrs_mr = { - /* 1 is used in order to free the MR after all the MWs */ - .type = UVERBS_TYPE_ALLOC_IDR(1), - .destroy_object = uverbs_free_mr, -}; +static int uverbs_create_cq_handler(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs) +{ + struct ib_ucontext *ucontext = file->ucontext; + struct ib_ucq_object *obj; + struct ib_udata uhw; + int ret; + u64 user_handle; + struct ib_cq_init_attr attr = {}; + struct ib_cq *cq; + struct ib_uverbs_completion_event_file *ev_file = NULL; + const struct uverbs_attr *ev_file_attr; + struct ib_uobject *ev_file_uobj; + + if (!(ib_dev->uverbs_cmd_mask & 1ULL << IB_USER_VERBS_CMD_CREATE_CQ)) + return -EOPNOTSUPP; + + ret = uverbs_copy_from(&attr.comp_vector, attrs, CREATE_CQ_COMP_VECTOR); + if (!ret) + ret = uverbs_copy_from(&attr.cqe, attrs, CREATE_CQ_CQE); + if (!ret) + ret = uverbs_copy_from(&user_handle, attrs, CREATE_CQ_USER_HANDLE); + if (ret) + return ret; -const struct uverbs_obj_idr_type uverbs_type_attrs_srq = { - .type = UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_usrq_object), 0), - .destroy_object = uverbs_free_srq, -}; + /* Optional param, if it doesn't exist, we get -ENOENT and skip it */ + if (uverbs_copy_from(&attr.flags, attrs, CREATE_CQ_FLAGS) == -EFAULT) + return -EFAULT; -const struct 
uverbs_obj_idr_type uverbs_type_attrs_ah = { - .type = UVERBS_TYPE_ALLOC_IDR(0), - .destroy_object = uverbs_free_ah, -}; + ev_file_attr = uverbs_attr_get(attrs, CREATE_CQ_COMP_CHANNEL); + if (!IS_ERR(ev_file_attr)) { + ev_file_uobj = ev_file_attr->obj_attr.uobject; -const struct uverbs_obj_idr_type uverbs_type_attrs_flow = { - .type = UVERBS_TYPE_ALLOC_IDR(0), - .destroy_object = uverbs_free_flow, -}; + ev_file = container_of(ev_file_uobj, + struct ib_uverbs_completion_event_file, + uobj_file.uobj); + uverbs_uobject_get(ev_file_uobj); + } -const struct uverbs_obj_idr_type uverbs_type_attrs_wq = { - .type = UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uwq_object), 0), - .destroy_object = uverbs_free_wq, -}; + if (attr.comp_vector >= ucontext->ufile->device->num_comp_vectors) { + ret = -EINVAL; + goto err_event_file; + } -const struct uverbs_obj_idr_type uverbs_type_attrs_rwq_ind_table = { - .type = UVERBS_TYPE_ALLOC_IDR(0), - .destroy_object = uverbs_free_rwq_ind_tbl, -}; + obj = container_of(uverbs_attr_get(attrs, CREATE_CQ_HANDLE)->obj_attr.uobject, + typeof(*obj), uobject); + obj->uverbs_file = ucontext->ufile; + obj->comp_events_reported = 0; + obj->async_events_reported = 0; + INIT_LIST_HEAD(&obj->comp_list); + INIT_LIST_HEAD(&obj->async_list); + + /* Temporary, only until drivers get the new uverbs_attr_bundle */ + create_udata(attrs, &uhw); + + cq = ib_dev->create_cq(ib_dev, &attr, ucontext, &uhw); + if (IS_ERR(cq)) { + ret = PTR_ERR(cq); + goto err_event_file; + } -const struct uverbs_obj_idr_type uverbs_type_attrs_xrcd = { - .type = UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uxrcd_object), 0), - .destroy_object = uverbs_free_xrcd, -}; + cq->device = ib_dev; + cq->uobject = &obj->uobject; + cq->comp_handler = ib_uverbs_comp_handler; + cq->event_handler = ib_uverbs_cq_event_handler; + cq->cq_context = &ev_file->ev_queue; + obj->uobject.object = cq; + obj->uobject.user_handle = user_handle; + atomic_set(&cq->usecnt, 0); + + ret = uverbs_copy_to(attrs, CREATE_CQ_RESP_CQE, &cq->cqe); + if (ret) + goto err_cq; -const struct uverbs_obj_idr_type uverbs_type_attrs_pd = { - /* 2 is used in order to free the PD after MRs */ - .type = UVERBS_TYPE_ALLOC_IDR(2), - .destroy_object = uverbs_free_pd, + return 0; +err_cq: + ib_destroy_cq(cq); + +err_event_file: + if (ev_file) + uverbs_uobject_put(ev_file_uobj); + return ret; }; + +static DECLARE_UVERBS_METHOD( + uverbs_method_cq_create, UVERBS_CQ_CREATE, uverbs_create_cq_handler, + &UVERBS_ATTR_IDR(CREATE_CQ_HANDLE, UVERBS_OBJECT_CQ, UVERBS_ACCESS_NEW, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(CREATE_CQ_CQE, u32, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(CREATE_CQ_USER_HANDLE, u64, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_FD(CREATE_CQ_COMP_CHANNEL, UVERBS_OBJECT_COMP_CHANNEL, + UVERBS_ACCESS_READ), + &UVERBS_ATTR_PTR_IN(CREATE_CQ_COMP_VECTOR, u32, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(CREATE_CQ_FLAGS, u32), + &UVERBS_ATTR_PTR_OUT(CREATE_CQ_RESP_CQE, u32, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &uverbs_uhw_compat_in, &uverbs_uhw_compat_out); + +static int uverbs_destroy_cq_handler(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_destroy_cq_resp resp; + struct ib_uobject *uobj = + uverbs_attr_get(attrs, DESTROY_CQ_HANDLE)->obj_attr.uobject; + struct ib_ucq_object *obj = container_of(uobj, struct ib_ucq_object, + uobject); + int ret; + + if (!(ib_dev->uverbs_cmd_mask & 1ULL << 
IB_USER_VERBS_CMD_DESTROY_CQ)) + return -EOPNOTSUPP; + + ret = rdma_explicit_destroy(uobj); + if (ret) + return ret; + + resp.comp_events_reported = obj->comp_events_reported; + resp.async_events_reported = obj->async_events_reported; + + return uverbs_copy_to(attrs, DESTROY_CQ_RESP, &resp); +} + +static DECLARE_UVERBS_METHOD( + uverbs_method_cq_destroy, UVERBS_CQ_DESTROY, uverbs_destroy_cq_handler, + &UVERBS_ATTR_IDR(DESTROY_CQ_HANDLE, UVERBS_OBJECT_CQ, + UVERBS_ACCESS_DESTROY, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_OUT(DESTROY_CQ_RESP, struct ib_uverbs_destroy_cq_resp, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); + +DECLARE_UVERBS_OBJECT(uverbs_object_comp_channel, + UVERBS_OBJECT_COMP_CHANNEL, + &UVERBS_TYPE_ALLOC_FD(0, + sizeof(struct ib_uverbs_completion_event_file), + uverbs_hot_unplug_completion_event_file, + &uverbs_event_fops, + "[infinibandevent]", O_RDONLY)); + +DECLARE_UVERBS_OBJECT(uverbs_object_cq, UVERBS_OBJECT_CQ, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_ucq_object), 0, + uverbs_free_cq), + &uverbs_method_cq_create, + &uverbs_method_cq_destroy); + +DECLARE_UVERBS_OBJECT(uverbs_object_qp, UVERBS_OBJECT_QP, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object), 0, + uverbs_free_qp)); + +DECLARE_UVERBS_OBJECT(uverbs_object_mw, UVERBS_OBJECT_MW, + &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_mw)); + +DECLARE_UVERBS_OBJECT(uverbs_object_mr, UVERBS_OBJECT_MR, + /* 1 is used in order to free the MR after all the MWs */ + &UVERBS_TYPE_ALLOC_IDR(1, uverbs_free_mr)); + +DECLARE_UVERBS_OBJECT(uverbs_object_srq, UVERBS_OBJECT_SRQ, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_usrq_object), 0, + uverbs_free_srq)); + +DECLARE_UVERBS_OBJECT(uverbs_object_ah, UVERBS_OBJECT_AH, + &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_ah)); + +DECLARE_UVERBS_OBJECT(uverbs_object_flow, UVERBS_OBJECT_FLOW, + &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_flow)); + +DECLARE_UVERBS_OBJECT(uverbs_object_wq, UVERBS_OBJECT_WQ, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uwq_object), 0, + uverbs_free_wq)); + +DECLARE_UVERBS_OBJECT(uverbs_object_rwq_ind_table, + UVERBS_OBJECT_RWQ_IND_TBL, + &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_rwq_ind_tbl)); + +DECLARE_UVERBS_OBJECT(uverbs_object_xrcd, UVERBS_OBJECT_XRCD, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uxrcd_object), 0, + uverbs_free_xrcd)); + +DECLARE_UVERBS_OBJECT(uverbs_object_pd, UVERBS_OBJECT_PD, + /* 2 is used in order to free the PD after MRs */ + &UVERBS_TYPE_ALLOC_IDR(2, uverbs_free_pd)); + +DECLARE_UVERBS_OBJECT(uverbs_object_device, UVERBS_OBJECT_DEVICE, NULL); + +DECLARE_UVERBS_OBJECT_TREE(uverbs_default_objects, + &uverbs_object_device, + &uverbs_object_pd, + &uverbs_object_mr, + &uverbs_object_comp_channel, + &uverbs_object_cq, + &uverbs_object_qp, + &uverbs_object_ah, + &uverbs_object_mw, + &uverbs_object_srq, + &uverbs_object_flow, + &uverbs_object_wq, + &uverbs_object_rwq_ind_table, + &uverbs_object_xrcd); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index c973a83c898b..de57d6c11a25 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -180,39 +180,29 @@ EXPORT_SYMBOL(ib_rate_to_mbps); __attribute_const__ enum rdma_transport_type rdma_node_get_transport(enum rdma_node_type node_type) { - switch (node_type) { - case RDMA_NODE_IB_CA: - case RDMA_NODE_IB_SWITCH: - case RDMA_NODE_IB_ROUTER: - return RDMA_TRANSPORT_IB; - case RDMA_NODE_RNIC: - return RDMA_TRANSPORT_IWARP; - case RDMA_NODE_USNIC: + + if (node_type == RDMA_NODE_USNIC) return RDMA_TRANSPORT_USNIC; - case 
RDMA_NODE_USNIC_UDP: + if (node_type == RDMA_NODE_USNIC_UDP) return RDMA_TRANSPORT_USNIC_UDP; - default: - BUG(); - return 0; - } + if (node_type == RDMA_NODE_RNIC) + return RDMA_TRANSPORT_IWARP; + + return RDMA_TRANSPORT_IB; } EXPORT_SYMBOL(rdma_node_get_transport); enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num) { + enum rdma_transport_type lt; if (device->get_link_layer) return device->get_link_layer(device, port_num); - switch (rdma_node_get_transport(device->node_type)) { - case RDMA_TRANSPORT_IB: + lt = rdma_node_get_transport(device->node_type); + if (lt == RDMA_TRANSPORT_IB) return IB_LINK_LAYER_INFINIBAND; - case RDMA_TRANSPORT_IWARP: - case RDMA_TRANSPORT_USNIC: - case RDMA_TRANSPORT_USNIC_UDP: - return IB_LINK_LAYER_ETHERNET; - default: - return IB_LINK_LAYER_UNSPECIFIED; - } + + return IB_LINK_LAYER_ETHERNET; } EXPORT_SYMBOL(rdma_port_get_link_layer); @@ -452,6 +442,19 @@ int ib_get_gids_from_rdma_hdr(const union rdma_network_hdr *hdr, } EXPORT_SYMBOL(ib_get_gids_from_rdma_hdr); +/* + * This function creates ah from the incoming packet. + * Incoming packet has dgid of the receiver node on which this code is + * getting executed and, sgid contains the GID of the sender. + * + * When resolving mac address of destination, the arrived dgid is used + * as sgid and, sgid is used as dgid because sgid contains destinations + * GID whom to respond to. + * + * This is why when calling rdma_addr_find_l2_eth_by_grh() function, the + * position of arguments dgid and sgid do not match the order of the + * parameters. + */ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, const struct ib_wc *wc, const struct ib_grh *grh, struct rdma_ah_attr *ah_attr) @@ -465,6 +468,8 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, union ib_gid dgid; union ib_gid sgid; + might_sleep(); + memset(ah_attr, 0, sizeof *ah_attr); ah_attr->type = rdma_ah_find_type(device, port_num); if (rdma_cap_eth_ah(device, port_num)) { @@ -507,11 +512,6 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, } resolved_dev = dev_get_by_index(&init_net, if_index); - if (resolved_dev->flags & IFF_LOOPBACK) { - dev_put(resolved_dev); - resolved_dev = idev; - dev_hold(resolved_dev); - } rcu_read_lock(); if (resolved_dev != idev && !rdma_is_upper_dev_rcu(idev, resolved_dev)) @@ -624,11 +624,13 @@ struct ib_srq *ib_create_srq(struct ib_pd *pd, srq->event_handler = srq_init_attr->event_handler; srq->srq_context = srq_init_attr->srq_context; srq->srq_type = srq_init_attr->srq_type; + if (ib_srq_has_cq(srq->srq_type)) { + srq->ext.cq = srq_init_attr->ext.cq; + atomic_inc(&srq->ext.cq->usecnt); + } if (srq->srq_type == IB_SRQT_XRC) { srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd; - srq->ext.xrc.cq = srq_init_attr->ext.xrc.cq; atomic_inc(&srq->ext.xrc.xrcd->usecnt); - atomic_inc(&srq->ext.xrc.cq->usecnt); } atomic_inc(&pd->usecnt); atomic_set(&srq->usecnt, 0); @@ -669,18 +671,18 @@ int ib_destroy_srq(struct ib_srq *srq) pd = srq->pd; srq_type = srq->srq_type; - if (srq_type == IB_SRQT_XRC) { + if (ib_srq_has_cq(srq_type)) + cq = srq->ext.cq; + if (srq_type == IB_SRQT_XRC) xrcd = srq->ext.xrc.xrcd; - cq = srq->ext.xrc.cq; - } ret = srq->device->destroy_srq(srq); if (!ret) { atomic_dec(&pd->usecnt); - if (srq_type == IB_SRQT_XRC) { + if (srq_type == IB_SRQT_XRC) atomic_dec(&xrcd->usecnt); + if (ib_srq_has_cq(srq_type)) atomic_dec(&cq->usecnt); - } } return ret; @@ -830,6 +832,7 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, spin_lock_init(&qp->mr_lock); 
INIT_LIST_HEAD(&qp->rdma_mrs); INIT_LIST_HEAD(&qp->sig_mrs); + qp->port = 0; if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) return ib_create_xrc_qp(qp, qp_init_attr); @@ -1235,6 +1238,18 @@ int ib_resolve_eth_dmac(struct ib_device *device, if (rdma_link_local_addr((struct in6_addr *)grh->dgid.raw)) { rdma_get_ll_mac((struct in6_addr *)grh->dgid.raw, ah_attr->roce.dmac); + return 0; + } + if (rdma_is_multicast_addr((struct in6_addr *)ah_attr->grh.dgid.raw)) { + if (ipv6_addr_v4mapped((struct in6_addr *)ah_attr->grh.dgid.raw)) { + __be32 addr = 0; + + memcpy(&addr, ah_attr->grh.dgid.raw + 12, 4); + ip_eth_mc_map(addr, (char *)ah_attr->roce.dmac); + } else { + ipv6_eth_mc_map((struct in6_addr *)ah_attr->grh.dgid.raw, + (char *)ah_attr->roce.dmac); + } } else { union ib_gid sgid; struct ib_gid_attr sgid_attr; @@ -1268,20 +1283,95 @@ out: } EXPORT_SYMBOL(ib_resolve_eth_dmac); -int ib_modify_qp(struct ib_qp *qp, - struct ib_qp_attr *qp_attr, - int qp_attr_mask) +/** + * ib_modify_qp_with_udata - Modifies the attributes for the specified QP. + * @qp: The QP to modify. + * @attr: On input, specifies the QP attributes to modify. On output, + * the current values of selected QP attributes are returned. + * @attr_mask: A bit-mask used to specify which attributes of the QP + * are being modified. + * @udata: pointer to user's input output buffer information + * are being modified. + * It returns 0 on success and returns appropriate error code on error. + */ +int ib_modify_qp_with_udata(struct ib_qp *qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) { + int ret; - if (qp_attr_mask & IB_QP_AV) { - int ret; - - ret = ib_resolve_eth_dmac(qp->device, &qp_attr->ah_attr); + if (attr_mask & IB_QP_AV) { + ret = ib_resolve_eth_dmac(qp->device, &attr->ah_attr); if (ret) return ret; } + ret = ib_security_modify_qp(qp, attr, attr_mask, udata); + if (!ret && (attr_mask & IB_QP_PORT)) + qp->port = attr->port_num; + + return ret; +} +EXPORT_SYMBOL(ib_modify_qp_with_udata); + +int ib_get_eth_speed(struct ib_device *dev, u8 port_num, u8 *speed, u8 *width) +{ + int rc; + u32 netdev_speed; + struct net_device *netdev; + struct ethtool_link_ksettings lksettings; + + if (rdma_port_get_link_layer(dev, port_num) != IB_LINK_LAYER_ETHERNET) + return -EINVAL; + + if (!dev->get_netdev) + return -EOPNOTSUPP; + + netdev = dev->get_netdev(dev, port_num); + if (!netdev) + return -ENODEV; + + rtnl_lock(); + rc = __ethtool_get_link_ksettings(netdev, &lksettings); + rtnl_unlock(); + + dev_put(netdev); + + if (!rc) { + netdev_speed = lksettings.base.speed; + } else { + netdev_speed = SPEED_1000; + pr_warn("%s speed is unknown, defaulting to %d\n", netdev->name, + netdev_speed); + } + + if (netdev_speed <= SPEED_1000) { + *width = IB_WIDTH_1X; + *speed = IB_SPEED_SDR; + } else if (netdev_speed <= SPEED_10000) { + *width = IB_WIDTH_1X; + *speed = IB_SPEED_FDR10; + } else if (netdev_speed <= SPEED_20000) { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_DDR; + } else if (netdev_speed <= SPEED_25000) { + *width = IB_WIDTH_1X; + *speed = IB_SPEED_EDR; + } else if (netdev_speed <= SPEED_40000) { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_FDR10; + } else { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_EDR; + } - return ib_security_modify_qp(qp->real_qp, qp_attr, qp_attr_mask, NULL); + return 0; +} +EXPORT_SYMBOL(ib_get_eth_speed); + +int ib_modify_qp(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask) +{ + return ib_modify_qp_with_udata(qp, qp_attr, qp_attr_mask, NULL); } EXPORT_SYMBOL(ib_modify_qp); @@ -1544,15 
+1634,53 @@ EXPORT_SYMBOL(ib_dealloc_fmr); /* Multicast groups */ +static bool is_valid_mcast_lid(struct ib_qp *qp, u16 lid) +{ + struct ib_qp_init_attr init_attr = {}; + struct ib_qp_attr attr = {}; + int num_eth_ports = 0; + int port; +
+ /* If QP state >= init, it is assigned to a port and we can check this + * port only. + */ + if (!ib_query_qp(qp, &attr, IB_QP_STATE | IB_QP_PORT, &init_attr)) { + if (attr.qp_state >= IB_QPS_INIT) { + if (rdma_port_get_link_layer(qp->device, attr.port_num) != + IB_LINK_LAYER_INFINIBAND) + return true; + goto lid_check; + } + } +
+ /* Can't get a quick answer, iterate over all ports */ + for (port = 0; port < qp->device->phys_port_cnt; port++) + if (rdma_port_get_link_layer(qp->device, port) != + IB_LINK_LAYER_INFINIBAND) + num_eth_ports++; +
+ /* If we have at least one Ethernet port, the RoCE annex declares that + * the multicast LID should be ignored. We can't tell at this step if the + * QP belongs to an IB or Ethernet port. + */ + if (num_eth_ports) + return true; +
+ /* If all the ports are IB, we can check according to the IB spec. */ +lid_check: + return !(lid < be16_to_cpu(IB_MULTICAST_LID_BASE) || + lid == be16_to_cpu(IB_LID_PERMISSIVE)); +} +
 int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) { int ret; if (!qp->device->attach_mcast) return -ENOSYS; - if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD || - lid < be16_to_cpu(IB_MULTICAST_LID_BASE) || - lid == be16_to_cpu(IB_LID_PERMISSIVE)) + + if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) || + qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid)) return -EINVAL; ret = qp->device->attach_mcast(qp, gid, lid); @@ -1568,9 +1696,9 @@ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) if (!qp->device->detach_mcast) return -ENOSYS; - if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD || - lid < be16_to_cpu(IB_MULTICAST_LID_BASE) || - lid == be16_to_cpu(IB_LID_PERMISSIVE)) + + if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) || + qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid)) return -EINVAL; ret = qp->device->detach_mcast(qp, gid, lid);
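For orientation, below is a minimal user-space sketch (not taken from this series) of driving the new RDMA_VERBS_IOCTL path, mirroring the checks ib_uverbs_ioctl() makes above: hdr.length must equal sizeof(hdr) plus num_attrs attribute slots, reserved fields must be zero, and mandatory attributes carry UVERBS_ATTR_F_MANDATORY. It assumes the UAPI definitions exported via rdma_user_ioctl.h (struct ib_uverbs_ioctl_hdr, struct ib_uverbs_attr, RDMA_VERBS_IOCTL); the object/method/attribute ids are placeholders rather than real ABI values, and the ioctl is only wired up when CONFIG_INFINIBAND_EXP_USER_ACCESS is enabled.

#include <stdint.h>
#include <sys/ioctl.h>
#include <rdma/rdma_user_ioctl.h>	/* ib_uverbs_ioctl_hdr, ib_uverbs_attr, RDMA_VERBS_IOCTL */

#define EXAMPLE_OBJECT_ID 1	/* placeholder id, not a real ABI value */
#define EXAMPLE_METHOD_ID 1	/* placeholder id, not a real ABI value */
#define EXAMPLE_ATTR_ID   0	/* placeholder id of a pointer (PTR_IN) attribute */

static int example_verbs_ioctl(int uverbs_fd, uint64_t in_val)
{
	/* The kernel reads the attribute array from arg + sizeof(hdr), so lay
	 * out the header and the attributes back to back in one 8-byte
	 * aligned buffer. */
	uint64_t buf[(sizeof(struct ib_uverbs_ioctl_hdr) +
		      sizeof(struct ib_uverbs_attr) + sizeof(uint64_t) - 1) /
		     sizeof(uint64_t)] = {0};
	struct ib_uverbs_ioctl_hdr *hdr = (struct ib_uverbs_ioctl_hdr *)buf;
	struct ib_uverbs_attr *attr =
		(struct ib_uverbs_attr *)((char *)buf + sizeof(*hdr));

	hdr->object_id = EXAMPLE_OBJECT_ID;
	hdr->method_id = EXAMPLE_METHOD_ID;
	hdr->num_attrs = 1;
	/* ib_uverbs_ioctl() rejects the call unless length == header + attr slots */
	hdr->length = sizeof(*hdr) + hdr->num_attrs * sizeof(*attr);

	attr->attr_id = EXAMPLE_ATTR_ID;
	attr->flags = UVERBS_ATTR_F_MANDATORY;
	attr->len = sizeof(in_val);
	attr->data = (uintptr_t)&in_val;	/* pointer attributes pass a user buffer */

	return ioctl(uverbs_fd, RDMA_VERBS_IOCTL, buf);
}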