From a374c57b0764432a80303abee3d1afd1939b5a0a Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Mon, 13 Apr 2009 14:40:18 -0700 Subject: sgi-xpc: prevent false heartbeat failures The heartbeat timeout functionality in sgi-xpc is currently not trained to the connection time. If a connection is made and the code is in the last polling window prior to doing a timeout, the next polling window will see the heartbeat as unchanged and initiate a no-heartbeat disconnect. Signed-off-by: Robin Holt Signed-off-by: Dean Nelson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/sgi-xp/xpc_uv.c | 123 +++++++++++++++---------------------------- 1 file changed, 43 insertions(+), 80 deletions(-) (limited to 'drivers/misc/sgi-xp/xpc_uv.c') diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c index f7fff4727edb..97f7cb21a0a2 100644 --- a/drivers/misc/sgi-xp/xpc_uv.c +++ b/drivers/misc/sgi-xp/xpc_uv.c @@ -46,8 +46,7 @@ struct uv_IO_APIC_route_entry { }; #endif -static atomic64_t xpc_heartbeat_uv; -static DECLARE_BITMAP(xpc_heartbeating_to_mask_uv, XP_MAX_NPARTITIONS_UV); +static struct xpc_heartbeat_uv *xpc_heartbeat_uv; #define XPC_ACTIVATE_MSG_SIZE_UV (1 * GRU_CACHE_LINE_BYTES) #define XPC_ACTIVATE_MQ_SIZE_UV (4 * XP_MAX_NPARTITIONS_UV * \ @@ -423,41 +422,6 @@ xpc_handle_activate_mq_msg_uv(struct xpc_partition *part, /* syncing of remote_act_state was just done above */ break; - case XPC_ACTIVATE_MQ_MSG_INC_HEARTBEAT_UV: { - struct xpc_activate_mq_msg_heartbeat_req_uv *msg; - - msg = container_of(msg_hdr, - struct xpc_activate_mq_msg_heartbeat_req_uv, - hdr); - part_uv->heartbeat = msg->heartbeat; - break; - } - case XPC_ACTIVATE_MQ_MSG_OFFLINE_HEARTBEAT_UV: { - struct xpc_activate_mq_msg_heartbeat_req_uv *msg; - - msg = container_of(msg_hdr, - struct xpc_activate_mq_msg_heartbeat_req_uv, - hdr); - part_uv->heartbeat = msg->heartbeat; - - spin_lock_irqsave(&part_uv->flags_lock, irq_flags); - part_uv->flags |= XPC_P_HEARTBEAT_OFFLINE_UV; - spin_unlock_irqrestore(&part_uv->flags_lock, irq_flags); - break; - } - case XPC_ACTIVATE_MQ_MSG_ONLINE_HEARTBEAT_UV: { - struct xpc_activate_mq_msg_heartbeat_req_uv *msg; - - msg = container_of(msg_hdr, - struct xpc_activate_mq_msg_heartbeat_req_uv, - hdr); - part_uv->heartbeat = msg->heartbeat; - - spin_lock_irqsave(&part_uv->flags_lock, irq_flags); - part_uv->flags &= ~XPC_P_HEARTBEAT_OFFLINE_UV; - spin_unlock_irqrestore(&part_uv->flags_lock, irq_flags); - break; - } case XPC_ACTIVATE_MQ_MSG_ACTIVATE_REQ_UV: { struct xpc_activate_mq_msg_activate_req_uv *msg; @@ -475,6 +439,7 @@ xpc_handle_activate_mq_msg_uv(struct xpc_partition *part, part_uv->act_state_req = XPC_P_ASR_ACTIVATE_UV; part->remote_rp_pa = msg->rp_gpa; /* !!! _pa is _gpa */ part->remote_rp_ts_jiffies = msg_hdr->rp_ts_jiffies; + part_uv->heartbeat_gpa = msg->heartbeat_gpa; if (msg->activate_gru_mq_desc_gpa != part_uv->activate_gru_mq_desc_gpa) { @@ -759,7 +724,7 @@ xpc_send_local_activate_IRQ_uv(struct xpc_partition *part, int act_state_req) /* * !!! Make our side think that the remote partition sent an activate - * !!! message our way by doing what the activate IRQ handler would + * !!! mq message our way by doing what the activate IRQ handler would * !!! do had one really been sent. */ @@ -808,88 +773,80 @@ xpc_get_partition_rsvd_page_pa_uv(void *buf, u64 *cookie, unsigned long *rp_pa, static int xpc_setup_rsvd_page_sn_uv(struct xpc_rsvd_page *rp) { - rp->sn.activate_gru_mq_desc_gpa = + xpc_heartbeat_uv = + &xpc_partitions[sn_partition_id].sn.uv.cached_heartbeat; + rp->sn.uv.heartbeat_gpa = uv_gpa(xpc_heartbeat_uv); + rp->sn.uv.activate_gru_mq_desc_gpa = uv_gpa(xpc_activate_mq_uv->gru_mq_desc); return 0; } static void -xpc_send_heartbeat_uv(int msg_type) +xpc_allow_hb_uv(short partid) { - short partid; - struct xpc_partition *part; - struct xpc_activate_mq_msg_heartbeat_req_uv msg; - - /* - * !!! On uv we're broadcasting a heartbeat message every 5 seconds. - * !!! Whereas on sn2 we're bte_copy'ng the heartbeat info every 20 - * !!! seconds. This is an increase in numalink traffic. - * ??? Is this good? - */ - - msg.heartbeat = atomic64_inc_return(&xpc_heartbeat_uv); - - partid = find_first_bit(xpc_heartbeating_to_mask_uv, - XP_MAX_NPARTITIONS_UV); - - while (partid < XP_MAX_NPARTITIONS_UV) { - part = &xpc_partitions[partid]; +} - xpc_send_activate_IRQ_part_uv(part, &msg, sizeof(msg), - msg_type); +static void +xpc_disallow_hb_uv(short partid) +{ +} - partid = find_next_bit(xpc_heartbeating_to_mask_uv, - XP_MAX_NPARTITIONS_UV, partid + 1); - } +static void +xpc_disallow_all_hbs_uv(void) +{ } static void xpc_increment_heartbeat_uv(void) { - xpc_send_heartbeat_uv(XPC_ACTIVATE_MQ_MSG_INC_HEARTBEAT_UV); + xpc_heartbeat_uv->value++; } static void xpc_offline_heartbeat_uv(void) { - xpc_send_heartbeat_uv(XPC_ACTIVATE_MQ_MSG_OFFLINE_HEARTBEAT_UV); + xpc_increment_heartbeat_uv(); + xpc_heartbeat_uv->offline = 1; } static void xpc_online_heartbeat_uv(void) { - xpc_send_heartbeat_uv(XPC_ACTIVATE_MQ_MSG_ONLINE_HEARTBEAT_UV); + xpc_increment_heartbeat_uv(); + xpc_heartbeat_uv->offline = 0; } static void xpc_heartbeat_init_uv(void) { - atomic64_set(&xpc_heartbeat_uv, 0); - bitmap_zero(xpc_heartbeating_to_mask_uv, XP_MAX_NPARTITIONS_UV); - xpc_heartbeating_to_mask = &xpc_heartbeating_to_mask_uv[0]; + xpc_heartbeat_uv->value = 1; + xpc_heartbeat_uv->offline = 0; } static void xpc_heartbeat_exit_uv(void) { - xpc_send_heartbeat_uv(XPC_ACTIVATE_MQ_MSG_OFFLINE_HEARTBEAT_UV); + xpc_offline_heartbeat_uv(); } static enum xp_retval xpc_get_remote_heartbeat_uv(struct xpc_partition *part) { struct xpc_partition_uv *part_uv = &part->sn.uv; - enum xp_retval ret = xpNoHeartbeat; + enum xp_retval ret; - if (part_uv->remote_act_state != XPC_P_AS_INACTIVE && - part_uv->remote_act_state != XPC_P_AS_DEACTIVATING) { + ret = xp_remote_memcpy(uv_gpa(&part_uv->cached_heartbeat), + part_uv->heartbeat_gpa, + sizeof(struct xpc_heartbeat_uv)); + if (ret != xpSuccess) + return ret; - if (part_uv->heartbeat != part->last_heartbeat || - (part_uv->flags & XPC_P_HEARTBEAT_OFFLINE_UV)) { + if (part_uv->cached_heartbeat.value == part->last_heartbeat && + !part_uv->cached_heartbeat.offline) { - part->last_heartbeat = part_uv->heartbeat; - ret = xpSuccess; - } + ret = xpNoHeartbeat; + } else { + part->last_heartbeat = part_uv->cached_heartbeat.value; } return ret; } @@ -904,8 +861,9 @@ xpc_request_partition_activation_uv(struct xpc_rsvd_page *remote_rp, part->remote_rp_pa = remote_rp_gpa; /* !!! _pa here is really _gpa */ part->remote_rp_ts_jiffies = remote_rp->ts_jiffies; + part->sn.uv.heartbeat_gpa = remote_rp->sn.uv.heartbeat_gpa; part->sn.uv.activate_gru_mq_desc_gpa = - remote_rp->sn.activate_gru_mq_desc_gpa; + remote_rp->sn.uv.activate_gru_mq_desc_gpa; /* * ??? Is it a good idea to make this conditional on what is @@ -913,8 +871,9 @@ xpc_request_partition_activation_uv(struct xpc_rsvd_page *remote_rp, */ if (part->sn.uv.remote_act_state == XPC_P_AS_INACTIVE) { msg.rp_gpa = uv_gpa(xpc_rsvd_page); + msg.heartbeat_gpa = xpc_rsvd_page->sn.uv.heartbeat_gpa; msg.activate_gru_mq_desc_gpa = - xpc_rsvd_page->sn.activate_gru_mq_desc_gpa; + xpc_rsvd_page->sn.uv.activate_gru_mq_desc_gpa; xpc_send_activate_IRQ_part_uv(part, &msg, sizeof(msg), XPC_ACTIVATE_MQ_MSG_ACTIVATE_REQ_UV); } @@ -1677,6 +1636,10 @@ xpc_init_uv(void) xpc_process_activate_IRQ_rcvd = xpc_process_activate_IRQ_rcvd_uv; xpc_get_partition_rsvd_page_pa = xpc_get_partition_rsvd_page_pa_uv; xpc_setup_rsvd_page_sn = xpc_setup_rsvd_page_sn_uv; + + xpc_allow_hb = xpc_allow_hb_uv; + xpc_disallow_hb = xpc_disallow_hb_uv; + xpc_disallow_all_hbs = xpc_disallow_all_hbs_uv; xpc_increment_heartbeat = xpc_increment_heartbeat_uv; xpc_offline_heartbeat = xpc_offline_heartbeat_uv; xpc_online_heartbeat = xpc_online_heartbeat_uv; -- cgit v1.2.3-55-g7522