Re: [PATCH v2 10/15] opensm: Add opensm option to specify file name for extra torus-2QoS configuration information.
Hi Jim, On 11:06 Wed 10 Mar , Jim Schutt wrote: Signed-off-by: Jim Schutt jasc...@sandia.gov --- opensm/include/opensm/osm_base.h | 18 ++ opensm/include/opensm/osm_subnet.h |5 + opensm/opensm/main.c |9 + opensm/opensm/osm_subnet.c |1 + opensm/opensm/osm_torus.c |2 +- It breaks to apply at this point. It is because file 'opensm/opensm/osm_torus.c' doesn't exist in previous patches. Could you please resend the patch series with files included? Thanks. Sasha 5 files changed, 34 insertions(+), 1 deletions(-) diff --git a/opensm/include/opensm/osm_base.h b/opensm/include/opensm/osm_base.h index 4e9aaa9..8720c38 100644 --- a/opensm/include/opensm/osm_base.h +++ b/opensm/include/opensm/osm_base.h @@ -277,6 +277,24 @@ BEGIN_C_DECLS #endif /* __WIN__ */ /***/ +/d* OpenSM: Base/OSM_DEFAULT_TORUS_CONF_FILE +* NAME +*OSM_DEFAULT_TORUS_CONF_FILE +* +* DESCRIPTION +*Specifies the default file name for extra torus-2QoS configuration +* +* SYNOPSIS +*/ +#ifdef __WIN__ +#define OSM_DEFAULT_TORUS_CONF_FILE strcat(GetOsmCachePath(), osm-torus-2QoS.conf) +#elif defined(OPENSM_CONFIG_DIR) +#define OSM_DEFAULT_TORUS_CONF_FILE OPENSM_CONFIG_DIR /torus-2QoS.conf +#else +#define OSM_DEFAULT_TORUS_CONF_FILE /etc/opensm/torus-2QoS.conf +#endif /* __WIN__ */ +/***/ + /d* OpenSM: Base/OSM_DEFAULT_PREFIX_ROUTES_FILE * NAME *OSM_DEFAULT_PREFIX_ROUTES_FILE diff --git a/opensm/include/opensm/osm_subnet.h b/opensm/include/opensm/osm_subnet.h index d74a57c..d2d9661 100644 --- a/opensm/include/opensm/osm_subnet.h +++ b/opensm/include/opensm/osm_subnet.h @@ -201,6 +201,7 @@ typedef struct osm_subn_opt { char *guid_routing_order_file; char *sa_db_file; boolean_t sa_db_dump; + char *torus_conf_file; boolean_t do_mesh_analysis; boolean_t exit_on_fatal; boolean_t honor_guid2lid_file; @@ -418,6 +419,10 @@ typedef struct osm_subn_opt { *When TRUE causes OpenSM to dump SA DB at the end of every *light sweep regardless the current verbosity level. 
* +*torus_conf_file +*Name of the file with extra configuration info for torus-2QoS +*routing engine. +* *exit_on_fatal *If TRUE (default) - SM will exit on fatal subnet initialization *issues. diff --git a/opensm/opensm/main.c b/opensm/opensm/main.c index f396de4..578ae9f 100644 --- a/opensm/opensm/main.c +++ b/opensm/opensm/main.c @@ -231,6 +231,10 @@ static void show_usage(void) Set the order port guids will be routed for the MinHop\n and Up/Down routing algorithms to the guids provided in the\n given file (one to a line)\n\n); + printf(--torus_config path to file\n + This option defines the file name for the extra configuration\n + info needed for the torus-2QoS routing engine. The default\n + name is \'OSM_DEFAULT_TORUS_CONF_FILE\'\n\n); printf(--once, -o\n This option causes OpenSM to configure the subnet\n once, then exit. Ports remain in the ACTIVE state.\n\n); @@ -610,6 +614,7 @@ int main(int argc, char *argv[]) {sm_sl, 1, NULL, 7}, {retries, 1, NULL, 8}, {log_prefix, 1, NULL, 9}, + {torus_config, 1, NULL, 10}, {NULL, 0, NULL, 0} /* Required at the end of the array */ }; @@ -992,6 +997,10 @@ int main(int argc, char *argv[]) SET_STR_OPT(opt.log_prefix, optarg); printf(Log prefix = %s\n, opt.log_prefix); break; + case 10: + SET_STR_OPT(opt.torus_conf_file, optarg); + printf(Torus-2QoS config file = %s\n, opt.torus_conf_file); + break; case 'h': case '?': case ':': diff --git a/opensm/opensm/osm_subnet.c b/opensm/opensm/osm_subnet.c index 55b9384..47aa529 100644 --- a/opensm/opensm/osm_subnet.c +++ b/opensm/opensm/osm_subnet.c @@ -758,6 +758,7 @@ void osm_subn_set_default_opt(IN osm_subn_opt_t * p_opt) p_opt-guid_routing_order_file = NULL; p_opt-sa_db_file = NULL; p_opt-sa_db_dump = FALSE; + p_opt-torus_conf_file = strdup(OSM_DEFAULT_TORUS_CONF_FILE); p_opt-do_mesh_analysis = FALSE; p_opt-exit_on_fatal = TRUE; p_opt-enable_quirks = FALSE; diff --git a/opensm/opensm/osm_torus.c b/opensm/opensm/osm_torus.c index 7f80034..7c3b550 100644 --- 
a/opensm/opensm/osm_torus.c +++ b/opensm/opensm/osm_torus.c @@ -9043,7 +9043,7 @@ int torus_build_lfts(void *context) torus-osm = ctx-osm; fabric-osm = ctx-osm; - if
[PATCH v3] opensm/osmeventplugin: added new events to monitor SM
Hi Sasha, Adding new events that allow event plug-in to see when SM finishes heavy sweep and routing configuration, when it updates dump files, when it is no longer master, and when SM port is down: OSM_EVENT_ID_HEAVY_SWEEP_DONE OSM_EVENT_ID_UCAST_ROUTING_DONE OSM_EVENT_ID_ENTERING_STANDBY OSM_EVENT_ID_SM_PORT_DOWN OSM_EVENT_ID_SA_DB_DUMPED The last event is reported when SA DB is actually dumped. Signed-off-by: Yevgeny Kliteynik klit...@dev.mellanox.co.il --- Changes from V2: - reduced number of events that are reported - rebased to latest master --- opensm/include/opensm/osm_event_plugin.h |7 ++- opensm/opensm/osm_state_mgr.c | 16 +++- opensm/osmeventplugin/src/osmeventplugin.c | 15 +++ 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/opensm/include/opensm/osm_event_plugin.h b/opensm/include/opensm/osm_event_plugin.h index 33d1920..a565123 100644 --- a/opensm/include/opensm/osm_event_plugin.h +++ b/opensm/include/opensm/osm_event_plugin.h @@ -72,7 +72,12 @@ typedef enum { OSM_EVENT_ID_PORT_SELECT, OSM_EVENT_ID_TRAP, OSM_EVENT_ID_SUBNET_UP, - OSM_EVENT_ID_MAX + OSM_EVENT_ID_MAX, + OSM_EVENT_ID_HEAVY_SWEEP_DONE, + OSM_EVENT_ID_UCAST_ROUTING_DONE, + OSM_EVENT_ID_ENTERING_STANDBY, + OSM_EVENT_ID_SM_PORT_DOWN, + OSM_EVENT_ID_SA_DB_DUMPED } osm_epi_event_id_t; typedef struct osm_epi_port_id { diff --git a/opensm/opensm/osm_state_mgr.c b/opensm/opensm/osm_state_mgr.c index 81c8f54..3231ae9 100644 --- a/opensm/opensm/osm_state_mgr.c +++ b/opensm/opensm/osm_state_mgr.c @@ -1151,6 +1151,8 @@ static void do_sweep(osm_sm_t * sm) if (!sm-p_subn-subnet_initialization_error) { OSM_LOG_MSG_BOX(sm-p_log, OSM_LOG_VERBOSE, REROUTE COMPLETE); + osm_opensm_report_event(sm-p_subn-p_osm, + OSM_EVENT_ID_UCAST_ROUTING_DONE, NULL); return; } } @@ -1185,6 +1187,8 @@ repeat_discovery: /* Move to DISCOVERING state */ osm_sm_state_mgr_process(sm, OSM_SM_SIGNAL_DISCOVER); + osm_opensm_report_event(sm-p_subn-p_osm, + OSM_EVENT_ID_SM_PORT_DOWN, NULL); return; } @@ -1205,6 +1209,8 
@@ repeat_discovery: ENTERING STANDBY STATE); /* notify master SM about us */ osm_send_trap144(sm, 0); + osm_opensm_report_event(sm-p_subn-p_osm, + OSM_EVENT_ID_ENTERING_STANDBY, NULL); return; } @@ -1212,6 +1218,9 @@ repeat_discovery: if (sm-p_subn-force_heavy_sweep) goto repeat_discovery; + osm_opensm_report_event(sm-p_subn-p_osm, + OSM_EVENT_ID_HEAVY_SWEEP_DONE, NULL); + OSM_LOG_MSG_BOX(sm-p_log, OSM_LOG_VERBOSE, HEAVY SWEEP COMPLETE); /* If we are MASTER - get the highest remote_sm, and @@ -1314,6 +1323,8 @@ repeat_discovery: OSM_LOG_MSG_BOX(sm-p_log, OSM_LOG_VERBOSE, SWITCHES CONFIGURED FOR UNICAST); + osm_opensm_report_event(sm-p_subn-p_osm, + OSM_EVENT_ID_UCAST_ROUTING_DONE, NULL); if (!sm-p_subn-opt.disable_multicast) { osm_mcast_mgr_process(sm); @@ -1375,7 +1386,10 @@ repeat_discovery: if (osm_log_is_active(sm-p_log, OSM_LOG_VERBOSE) || sm-p_subn-opt.sa_db_dump) - osm_sa_db_file_dump(sm-p_subn-p_osm); + if (!osm_sa_db_file_dump(sm-p_subn-p_osm)) + osm_opensm_report_event(sm-p_subn-p_osm, + OSM_EVENT_ID_SA_DB_DUMPED, NULL); + } /* diff --git a/opensm/osmeventplugin/src/osmeventplugin.c b/opensm/osmeventplugin/src/osmeventplugin.c index b4d9ce9..af68a5c 100644 --- a/opensm/osmeventplugin/src/osmeventplugin.c +++ b/opensm/osmeventplugin/src/osmeventplugin.c @@ -176,6 +176,21 @@ static void report(void *_log, osm_epi_event_id_t event_id, void *event_data) case OSM_EVENT_ID_SUBNET_UP: fprintf(log-log_file, Subnet up reported\n); break; + case OSM_EVENT_ID_HEAVY_SWEEP_DONE: + fprintf(log-log_file, Heavy sweep completed\n); + break; + case OSM_EVENT_ID_UCAST_ROUTING_DONE: + fprintf(log-log_file, Unicast routing completed\n); + break; + case OSM_EVENT_ID_ENTERING_STANDBY: + fprintf(log-log_file, Entering stand-by state\n); + break; + case OSM_EVENT_ID_SM_PORT_DOWN: + fprintf(log-log_file, SM port is down\n); + break; + case OSM_EVENT_ID_SA_DB_DUMPED: + fprintf(log-log_file, SA
[Patch v2] opensm/main.c: force stdout to be line-buffered
When stdout is assigned to a terminal, it is line-buffered. But when opensm's stdout is redirected to a file, stdout becomes block-buffered, which means that '\n' won't cause the buffer to be flushed. Forcing stdout to always be line-buffered and to have a more predictable behavior when used as opensm > some_file. Signed-off-by: Yevgeny Kliteynik klit...@dev.mellanox.co.il --- Changes since V1: - replacing setlinebuf() with an ANSI C compliant setvbuf() - Note: similar patch for ibv_asyncwatch was accepted by Roland: http://www.mail-archive.com/linux-rdma@vger.kernel.org/msg04161.html opensm/opensm/main.c |3 +++ 1 files changed, 3 insertions(+), 0 deletions(-) diff --git a/opensm/opensm/main.c b/opensm/opensm/main.c index 0093aa7..6e6c733 100644 --- a/opensm/opensm/main.c +++ b/opensm/opensm/main.c @@ -618,6 +618,9 @@ int main(int argc, char *argv[]) {NULL, 0, NULL, 0} /* Required at the end of the array */ }; + /* force stdout to be line-buffered */ + setvbuf(stdout, NULL, _IOLBF, 0); + /* Make sure that the opensm and complib were compiled using same modes (debug/free) */ if (osm_is_debug() != cl_is_debug()) { -- 1.5.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH resend] opensm/osm_sa_path_record.c: adding wrapper for pr_rcv_get_path_parms()
Adding non-static wrapper function for pr_rcv_get_path_parms() function to enable calling path record calculation function from outside this file. Signed-off-by: Yevgeny Kliteynik klit...@dev.mellanox.co.il --- opensm/opensm/osm_sa_path_record.c | 12 1 files changed, 12 insertions(+), 0 deletions(-) diff --git a/opensm/opensm/osm_sa_path_record.c b/opensm/opensm/osm_sa_path_record.c index f0d7ca2..2897c7b 100644 --- a/opensm/opensm/osm_sa_path_record.c +++ b/opensm/opensm/osm_sa_path_record.c @@ -764,6 +764,18 @@ Exit: return status; } +ib_api_status_t osm_get_path_params(IN osm_sa_t * sa, + IN const osm_port_t * p_src_port, + IN const osm_port_t * p_dest_port, + IN const uint16_t dlid_ho, + OUT osm_path_parms_t * p_parms) +{ + ib_path_rec_t pr; + memset(pr, 0, sizeof(ib_path_rec_t)); + return pr_rcv_get_path_parms(sa, pr, + p_src_port, p_dest_port, dlid_ho, 0, p_parms); +} + static void pr_rcv_build_pr(IN osm_sa_t * sa, IN const osm_port_t * p_src_port, IN const osm_port_t * p_dest_port, IN const ib_gid_t * p_dgid, -- 1.5.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 10/15] opensm: Add opensm option to specify file name for extra torus-2QoS configuration information.
Hi Sasha, Thanks for taking a look at this. On Thu, 2010-06-10 at 05:25 -0600, Sasha Khapyorsky wrote: Hi Jim, On 11:06 Wed 10 Mar , Jim Schutt wrote: Signed-off-by: Jim Schutt jasc...@sandia.gov --- opensm/include/opensm/osm_base.h | 18 ++ opensm/include/opensm/osm_subnet.h |5 + opensm/opensm/main.c |9 + opensm/opensm/osm_subnet.c |1 + opensm/opensm/osm_torus.c |2 +- It breaks to apply at this point. It is because file 'opensm/opensm/osm_torus.c' doesn't exist in previous patches. Could you please resend the patch series with files included? Thanks. So 7/15 has the patch that adds osm_torus.c as a compressed attachment, because the patch is so big. I sent it that way because I was afraid it would otherwise be rejected by vger. So you want me to resend with that big patch inline? Also, I have accumulated a few bug fixes to torus-2QoS that I haven't posted yet. I can 1) repost the patch series with no attachments, and add the bugfix patches at the end of series 2) repost a v3 patchset with these fixes merged. 3) do something else that you prefer. Let me know? -- Jim Sasha -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] return no path when path does not exist
return OSM_NO_PATH (instead of port num) when path does not exist. this will also be reported as error in the log. Signed-off-by: Eli Dorfman e...@voltaire.com --- opensm/opensm/osm_switch.c |2 ++ 1 files changed, 2 insertions(+), 0 deletions(-) diff --git a/opensm/opensm/osm_switch.c b/opensm/opensm/osm_switch.c index 311c4f7..b621852 100644 --- a/opensm/opensm/osm_switch.c +++ b/opensm/opensm/osm_switch.c @@ -628,6 +628,8 @@ uint8_t osm_switch_recommend_mcast_path(IN osm_switch_t * p_sw, a black hole that will destroy the Earth in a firey conflagration. */ least_hops = osm_switch_get_least_hops(p_sw, base_lid); + if (least_hops == OSM_NO_PATH) + return OSM_NO_PATH; for (port_num = 1; port_num num_ports; port_num++) if (osm_switch_get_hop_count(p_sw, base_lid, port_num) == least_hops) -- 1.5.5 -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
converting ib-mgmt scripts to executables
Sasha, Before we do the work, would there be any issue converting a couple of the ib-diag scripts to executables? Specifically, we'd like to have ibchecknet and ibcheckerrors functionality available on Windows. - Sean -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH/RFC] mlx4_core: module param to limit msix vec allocation
The mlx4_core driver allocates 'nreq' msix vectors (and irqs), where: nreq = min_t(int, dev-caps.num_eqs - dev-caps.reserved_eqs, num_possible_cpus() + 1); ConnectX HCAs support 512 event queues (4 reserved). On a system with enough processors, we get: mlx4_core 0006:01:00.0: Requested 508 vectors, but only 256 MSI-X vectors available, trying again Further attempts (by other drivers) to allocate interrupts fail, because mlx4_core got 'em all. How about this? Signed-off-by: Arthur Kepner akep...@sgi.com --- main.c |8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index e3e0d54..0a316d0 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -68,6 +68,10 @@ static int msi_x = 1; module_param(msi_x, int, 0444); MODULE_PARM_DESC(msi_x, attempt to use MSI-X if nonzero); +static int max_msi_x_vec = 64; +module_param(max_msi_x_vec, int, 0444); +MODULE_PARM_DESC(max_msi_x_vec, max MSI-X vectors we'll attempt to allocate); + #else /* CONFIG_PCI_MSI */ #define msi_x (0) @@ -968,8 +972,10 @@ static void mlx4_enable_msi_x(struct mlx4_dev *dev) int i; if (msi_x) { + nreq = min_t(int, num_possible_cpus() + 1, max_msi_x_vec); nreq = min_t(int, dev-caps.num_eqs - dev-caps.reserved_eqs, -num_possible_cpus() + 1); +nreq); + entries = kcalloc(nreq, sizeof *entries, GFP_KERNEL); if (!entries) goto no_msi; -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 00/19 V4] mlx4 SRIOV support
Hello Roland, This is another round of SRIOV support patches for mlx4. Currently the support is for the mlx4_core and mlx4_en modules. The main changes from previous round are: 1. Events on the Communication channel, no more polling. 2. Steering mechanism change for Ethernet traffic. Thanks, Yevgeny -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 01/19 V4] mlx4_core: identify function as pf or vf
Signed-off-by: Liran Liss lir...@mellanox.co.il Signed-off-by: Yevgeny Petrilin yevge...@mellanox.co.il --- include/linux/mlx4/device.h | 19 +++ 1 files changed, 19 insertions(+), 0 deletions(-) diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 7a7f9c1..84de4a6 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -42,6 +42,9 @@ enum { MLX4_FLAG_MSI_X = 1 0, MLX4_FLAG_OLD_PORT_CMDS = 1 1, + MLX4_FLAG_MASTER= 1 2, + MLX4_FLAG_SLAVE = 1 3, + MLX4_FLAG_SRIOV = 1 4, }; enum { @@ -376,6 +379,7 @@ struct mlx4_av { struct mlx4_dev { struct pci_dev *pdev; unsigned long flags; + unsigned long num_slaves; struct mlx4_capscaps; struct radix_tree_root qp_table_tree; u32 rev_id; @@ -401,6 +405,21 @@ struct mlx4_init_port_param { if (((type) == MLX4_PORT_TYPE_IB ? (dev)-caps.port_mask : \ ~(dev)-caps.port_mask) 1 ((port) - 1)) +static inline int mlx4_is_slave(struct mlx4_dev *dev) +{ + return dev-flags MLX4_FLAG_SLAVE; +} + +static inline int mlx4_is_master(struct mlx4_dev *dev) +{ + return dev-flags MLX4_FLAG_MASTER; +} + +static inline int mlx4_is_mfunc(struct mlx4_dev *dev) +{ + return dev-flags (MLX4_FLAG_MASTER | MLX4_FLAG_SLAVE); +} + int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, struct mlx4_buf *buf); void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf); -- 1.6.0.2 -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 02/19 V4] mlx4_core: add multi-function communication channel
The communication channel consists of 2 registers per vf (a slave function) that are shared with the pf (the master function), as well as a new command for inter-function memory copying (only exposed to the master). The communication channel is used to establish a Virtual HCA Command Register (vHCR) in each slave function, which allows it to pass FW commands to the master function for execution. The slave also uses the vHCR to pull slave-specific events from the master. Signed-off-by: Liran Liss lir...@mellanox.co.il Signed-off-by: Yevgeny Petrilin yevge...@mellanox.co.il --- drivers/net/mlx4/cmd.c | 746 ++- drivers/net/mlx4/en_port.h |5 - drivers/net/mlx4/eq.c | 89 +- drivers/net/mlx4/fw.c |8 + drivers/net/mlx4/main.c | 17 +- drivers/net/mlx4/mlx4.h | 72 - include/linux/mlx4/cmd.h| 12 +- include/linux/mlx4/device.h |3 +- 8 files changed, 921 insertions(+), 31 deletions(-) diff --git a/drivers/net/mlx4/cmd.c b/drivers/net/mlx4/cmd.c index 23cee7b..672e13b 100644 --- a/drivers/net/mlx4/cmd.c +++ b/drivers/net/mlx4/cmd.c @@ -141,6 +141,46 @@ static int mlx4_status_to_errno(u8 status) return trans_table[status]; } +static int comm_pending(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + u32 status = readl(priv-mfunc.comm-slave_read); + + return (swab32(status) 30) != priv-cmd.comm_toggle; +} + +int mlx4_comm_cmd(struct mlx4_dev *dev, u8 cmd, u16 param, unsigned long timeout) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + unsigned long end; + u32 val; + + /* First, verify that the master reports correct status */ + if (comm_pending(dev)) { + mlx4_warn(dev, Communication channel is not idle\n); + return -EAGAIN; + } + + /* Write command */ + if (cmd == MLX4_COMM_CMD_RESET) + priv-cmd.comm_toggle = 0; + else if (++priv-cmd.comm_toggle 2) + priv-cmd.comm_toggle = 1; + val = param | (cmd 16) | (priv-cmd.comm_toggle 30); + __raw_writel((__force u32) cpu_to_be32(val), priv-mfunc.comm-slave_write); + wmb(); + + end = msecs_to_jiffies(timeout) + jiffies; 
+ while (comm_pending(dev) time_before(jiffies, end)) + cond_resched(); + + if (comm_pending(dev)) { + mlx4_warn(dev, Communication channel timed out\n); + return -ETIMEDOUT; + } + return 0; +} + static int cmd_pending(struct mlx4_dev *dev) { u32 status = readl(mlx4_priv(dev)-cmd.hcr + HCR_STATUS_OFFSET); @@ -208,6 +248,33 @@ out: return ret; } +static int mlx4_slave_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param, +int out_is_imm, u32 in_modifier, u8 op_modifier, +u16 op, unsigned long timeout) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_vhcr *vhcr = priv-mfunc.vhcr; + int ret; + + down(priv-cmd.poll_sem); + vhcr-in_param = in_param; + vhcr-out_param = out_param ? *out_param : 0; + vhcr-in_modifier = in_modifier; + vhcr-timeout = timeout; + vhcr-op = op; + vhcr-token = CMD_POLL_TOKEN; + vhcr-op_modifier = op_modifier; + vhcr-errno = 0; + ret = mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR_POST, 0, MLX4_COMM_TIME + timeout); + if (!ret) { + if (out_is_imm) + *out_param = vhcr-out_param; + ret = vhcr-errno; + } + up(priv-cmd.poll_sem); + return ret; +} + static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param, int out_is_imm, u32 in_modifier, u8 op_modifier, u16 op, unsigned long timeout) @@ -315,12 +382,646 @@ int __mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param, if (mlx4_priv(dev)-cmd.use_events) return mlx4_cmd_wait(dev, in_param, out_param, out_is_imm, in_modifier, op_modifier, op, timeout); + + if (mlx4_is_slave(dev)) + return mlx4_slave_cmd_poll(dev, in_param, out_param, out_is_imm, +in_modifier, op_modifier, op, timeout); else return mlx4_cmd_poll(dev, in_param, out_param, out_is_imm, in_modifier, op_modifier, op, timeout); } EXPORT_SYMBOL_GPL(__mlx4_cmd); +static int mlx4_ACCESS_MEM(struct mlx4_dev *dev, u64 master_addr, + int slave, u64 slave_addr, + int size, int is_read) +{ + u64 in_param; + u64 out_param; + + if ((slave_addr 0xfff) | (master_addr 0xfff) | + (slave ~0x7f) | (size 0xff)) { + 
mlx4_err(dev, Bad access mem params - slave_addr:0x%llx + master_addr:0x%llx
[PATCH 03/19 V4] mlx4_core: add WRITE_MTT support
Used by vfs to modify mtts, since they cannot access in-memory mtts directly. Signed-off-by: Liran Liss lir...@mellanox.co.il Signed-off-by: Yevgeny Petrilin yevge...@mellanox.co.il --- drivers/net/mlx4/cmd.c | 10 +- drivers/net/mlx4/mlx4.h |5 +++ drivers/net/mlx4/mr.c | 67 ++ 3 files changed, 74 insertions(+), 8 deletions(-) diff --git a/drivers/net/mlx4/cmd.c b/drivers/net/mlx4/cmd.c index 672e13b..eac3b21 100644 --- a/drivers/net/mlx4/cmd.c +++ b/drivers/net/mlx4/cmd.c @@ -37,8 +37,6 @@ #include linux/pci.h #include linux/errno.h -#include linux/mlx4/cmd.h - #include asm/io.h #include mlx4.h @@ -498,6 +496,14 @@ static struct mlx4_cmd_info { .wrapper = NULL }, { + .opcode = MLX4_CMD_WRITE_MTT, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .verify = NULL, /* need verifier */ + .wrapper = mlx4_WRITE_MTT_wrapper + }, + { .opcode = MLX4_CMD_SYNC_TPT, .has_inbox = true, .has_outbox = false, diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h index fac5d6e..71b191e 100644 --- a/drivers/net/mlx4/mlx4.h +++ b/drivers/net/mlx4/mlx4.h @@ -46,6 +46,7 @@ #include linux/mlx4/device.h #include linux/mlx4/driver.h #include linux/mlx4/doorbell.h +#include linux/mlx4/cmd.h #define DRV_NAME mlx4_core #define PFXDRV_NAME : @@ -420,6 +421,10 @@ void mlx4_cleanup_qp_table(struct mlx4_dev *dev); void mlx4_cleanup_srq_table(struct mlx4_dev *dev); void mlx4_cleanup_mcg_table(struct mlx4_dev *dev); +int mlx4_WRITE_MTT_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, +struct mlx4_cmd_mailbox *inbox, +struct mlx4_cmd_mailbox *outbox); + void mlx4_start_catas_poll(struct mlx4_dev *dev); void mlx4_stop_catas_poll(struct mlx4_dev *dev); void mlx4_catas_init(void); diff --git a/drivers/net/mlx4/mr.c b/drivers/net/mlx4/mr.c index 3dc69be..67c0539 100644 --- a/drivers/net/mlx4/mr.c +++ b/drivers/net/mlx4/mr.c @@ -263,6 +263,35 @@ static int mlx4_HW2SW_MPT(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox !mailbox, MLX4_CMD_HW2SW_MPT, 
MLX4_CMD_TIME_CLASS_B); } +int mlx4_WRITE_MTT_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, +struct mlx4_cmd_mailbox *inbox, +struct mlx4_cmd_mailbox *outbox) +{ + struct mlx4_mtt mtt; + u64 *page_list = inbox-buf; + int i; + + /* Call the SW implementation of write_mtt: +* - Prepare a dummy mtt struct +* - Translate inbox contents to simple addresses in host endianess */ + mtt.first_seg = 0; + mtt.order = 0; + mtt.page_shift = 0; + for (i = 0; i vhcr-in_modifier; ++i) + page_list[i + 2] = be64_to_cpu(page_list[i + 2]) ~1ULL; + vhcr-errno = mlx4_write_mtt(dev, mtt, be64_to_cpu(page_list[0]), + vhcr-in_modifier, + page_list + 2); + return 0; +} + +static int mlx4_WRITE_MTT(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, + int num_entries) +{ + return mlx4_cmd(dev, mailbox-dma, num_entries, 0, MLX4_CMD_WRITE_MTT, + MLX4_CMD_TIME_CLASS_A); +} + int mlx4_mr_alloc(struct mlx4_dev *dev, u32 pd, u64 iova, u64 size, u32 access, int npages, int page_shift, struct mlx4_mr *mr) { @@ -414,24 +443,50 @@ static int mlx4_write_mtt_chunk(struct mlx4_dev *dev, struct mlx4_mtt *mtt, int mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, int start_index, int npages, u64 *page_list) { + struct mlx4_cmd_mailbox *mailbox = NULL; int chunk; - int err; + int err = 0; + __be64 *inbox = NULL; + int i; if (mtt-order 0) return -EINVAL; + if (mlx4_is_slave(dev)) { + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + inbox = mailbox-buf; + } + while (npages 0) { - chunk = min_t(int, PAGE_SIZE / sizeof(u64), npages); - err = mlx4_write_mtt_chunk(dev, mtt, start_index, chunk, page_list); + if (mlx4_is_slave(dev)) { + int s = mtt-first_seg * dev-caps.mtts_per_seg + start_index; + chunk = min_t(int, MLX4_MAILBOX_SIZE / sizeof(u64) - dev-caps.mtts_per_seg, npages); + if (s / (PAGE_SIZE / sizeof (u64)) != + (s + chunk - 1) / (PAGE_SIZE / sizeof (u64))) +
[PATCH 04/19 V4] mlx4_core: add slave resource allocation
All QPs/CQs/SRQs/MPTs/MTTs are allocated from shared pools, which are owned by the master. In addition, all backing ICM memory for these objects is managed by the master. To allow slaves to allocate resources, ICM allocation is separated from the rest of the object state, which is held at the slave. Slaves can then reserve resource ranges and allocate ICM over the comm channel. Signed-off-by: Liran Liss lir...@mellanox.co.il Signed-off-by: Yevgeny Petrilin yevge...@mellanox.co.il --- drivers/net/mlx4/cmd.c | 110 + drivers/net/mlx4/cq.c| 91 +--- drivers/net/mlx4/mlx4.h | 27 drivers/net/mlx4/mr.c| 125 ++ drivers/net/mlx4/qp.c| 151 +- drivers/net/mlx4/srq.c | 88 --- include/linux/mlx4/cmd.h |2 + 7 files changed, 496 insertions(+), 98 deletions(-) diff --git a/drivers/net/mlx4/cmd.c b/drivers/net/mlx4/cmd.c index eac3b21..3c95da7 100644 --- a/drivers/net/mlx4/cmd.c +++ b/drivers/net/mlx4/cmd.c @@ -418,6 +418,100 @@ static int mlx4_ACCESS_MEM(struct mlx4_dev *dev, u64 master_addr, MLX4_CMD_TIME_CLASS_A); } +static int mlx4_RESOURCE_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox) +{ + u32 param1 = *((u32 *) vhcr-in_param); + u32 param2 = *(((u32 *) vhcr-in_param) + 1); + int ret; + +#if 0 + char *res[] = {QP, CQ, SRQ, MPT, MTT}; + mlx4_warn(dev, resource wrapper - %s (mode: %s) type:%s param1:%d param2:%d\n, + vhcr-op == MLX4_CMD_ALLOC_RES ? allocate : free, + vhcr-op_modifier == ICM_RESERVE ? reserve : + (vhcr-op_modifier == ICM_ALLOC ? 
alloc : reserve+alloc), + res[vhcr-in_modifier], param1, param2); +#endif + + vhcr-errno = 0; + switch (vhcr-in_modifier) { + case RES_QP: + switch (vhcr-op_modifier) { + case ICM_RESERVE: + if (vhcr-op == MLX4_CMD_ALLOC_RES) { + vhcr-errno = mlx4_qp_reserve_range(dev, param1, param2, ret); + if (!vhcr-errno) + vhcr-out_param = ret; + } else { + mlx4_qp_release_range(dev, param1, param2); + } + break; + case ICM_ALLOC: + if (vhcr-op == MLX4_CMD_ALLOC_RES) + vhcr-errno = mlx4_qp_alloc_icm(dev, param1); + else + mlx4_qp_free_icm(dev, param1); + break; + default: + vhcr-errno = -EINVAL; + } + break; + case RES_CQ: + if (vhcr-op == MLX4_CMD_ALLOC_RES) { + vhcr-errno = mlx4_cq_alloc_icm(dev, ret); + if (!vhcr-errno) + vhcr-out_param = ret; + } else + mlx4_cq_free_icm(dev, param1); + break; + case RES_SRQ: + if (vhcr-op == MLX4_CMD_ALLOC_RES) { + vhcr-errno = mlx4_srq_alloc_icm(dev, ret); + if (!vhcr-errno) + vhcr-out_param = ret; + } else + mlx4_srq_free_icm(dev, param1); + break; + case RES_MPT: + switch (vhcr-op_modifier) { + case ICM_RESERVE: + if (vhcr-op == MLX4_CMD_ALLOC_RES) { + ret = mlx4_mr_reserve(dev); + if (ret == -1) + vhcr-errno = -ENOMEM; + else + vhcr-out_param = ret; + } else + mlx4_mr_release(dev, param1); + break; + case ICM_ALLOC: + if (vhcr-op == MLX4_CMD_ALLOC_RES) + vhcr-errno = mlx4_mr_alloc_icm(dev, param1); + else + mlx4_mr_free_icm(dev, param1); + break; + default: + vhcr-errno = -EINVAL; + } + break; + case RES_MTT: + if (vhcr-op == MLX4_CMD_ALLOC_RES) { + ret = mlx4_alloc_mtt_range(dev, param1 /* order */); + if (ret == -1) + vhcr-errno = -ENOMEM; +
[PATCH 05/19 V4] mlx4_core: add port para-virtualization
Ports are a shared resource among functions, so special behavior is needed here: - Bring up ports if at least one function has done so. - Bring down ports if all functions have done so. - Aggregate IB port capabilities - Set max mtu among for Eth port - Ensure steering is not broken for Eth ports. Signed-off-by: Liran Liss lir...@mellanox.co.il Signed-off-by: Yevgeny Petrilin yevge...@mellanox.co.il --- drivers/net/mlx4/cmd.c | 58 +++ drivers/net/mlx4/en_netdev.c | 32 +++--- drivers/net/mlx4/en_port.c |1 + drivers/net/mlx4/en_port.h | 12 ++- drivers/net/mlx4/en_rx.c | 11 +-- drivers/net/mlx4/fw.c| 79 ++- drivers/net/mlx4/main.c | 22 +++-- drivers/net/mlx4/mlx4.h | 25 +- drivers/net/mlx4/port.c | 220 + include/linux/mlx4/cmd.h |2 + include/linux/mlx4/device.h |5 +- 11 files changed, 399 insertions(+), 68 deletions(-) diff --git a/drivers/net/mlx4/cmd.c b/drivers/net/mlx4/cmd.c index 3c95da7..6637d5a 100644 --- a/drivers/net/mlx4/cmd.c +++ b/drivers/net/mlx4/cmd.c @@ -506,6 +506,24 @@ static int mlx4_RESOURCE_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vh } else mlx4_free_mtt_range(dev, param1 /* first */, param2 /* order */); break; + case RES_MAC: + switch (vhcr-op) { + case MLX4_CMD_ALLOC_RES: + ret = mlx4_register_mac(dev, vhcr-op_modifier, + vhcr-in_param, (int *) vhcr-out_param); + vhcr-errno = ret; + break; + case MLX4_CMD_FREE_RES: + mlx4_unregister_mac(dev, vhcr-op_modifier, vhcr-in_param); + break; + case MLX4_CMD_REPLACE_RES: + ret = mlx4_replace_mac(dev, vhcr-op_modifier, + vhcr-out_param, vhcr-in_param); + vhcr-errno = ret; + break; + default: + vhcr-errno = -EINVAL; + } default: vhcr-errno = -EINVAL; } @@ -541,6 +559,38 @@ static struct mlx4_cmd_info { }, { + .opcode = MLX4_CMD_INIT_PORT, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .verify = NULL, + .wrapper = mlx4_INIT_PORT_wrapper}, + { + .opcode = MLX4_CMD_CLOSE_PORT, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .verify = NULL, + .wrapper = 
mlx4_CLOSE_PORT_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_PORT, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .verify = NULL, + .wrapper = mlx4_QUERY_PORT_wrapper + }, + { + .opcode = MLX4_CMD_SET_PORT, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .verify = NULL, + .wrapper = mlx4_SET_PORT_wrapper + }, + + { .opcode = MLX4_CMD_SW2HW_EQ, .has_inbox = true, .has_outbox = false, @@ -574,6 +624,14 @@ static struct mlx4_cmd_info { }, { + .opcode = MLX4_CMD_REPLACE_RES, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = true, + .verify = NULL, + .wrapper = mlx4_RESOURCE_wrapper + }, + { .opcode = MLX4_CMD_SW2HW_MPT, .has_inbox = true, .has_outbox = false, diff --git a/drivers/net/mlx4/en_netdev.c b/drivers/net/mlx4/en_netdev.c index 96180c0..7389fa2 100644 --- a/drivers/net/mlx4/en_netdev.c +++ b/drivers/net/mlx4/en_netdev.c @@ -146,9 +146,8 @@ static void mlx4_en_do_set_mac(struct work_struct *work) mutex_lock(mdev-state_lock); if (priv-port_up) { /* Remove old MAC and insert the new one */ - mlx4_unregister_mac(mdev-dev, priv-port, priv-mac_index); - err = mlx4_register_mac(mdev-dev, priv-port, - priv-mac, priv-mac_index); + err = mlx4_replace_mac(mdev-dev, priv-port, + priv-base_qpn, priv-mac); if (err) en_err(priv, Failed changing HW MAC address\n); } else @@ -589,10 +588,19 @@ int mlx4_en_start_port(struct net_device *dev) ++rx_index; } + /* Set port mac number */ + en_dbg(DRV, priv, Setting mac for port %d\n, priv-port); + err =
[PATCH 06/19 V4] mlx4_core: dispatch slave asynch events
Affiliated and unaffiliated asynch events are handled by a single EQ owned by the master. A per-slave SW event queue is added to log and dispatch both slave-specific events and events that apply to all slaves. Signed-off-by: Liran Liss lir...@mellanox.co.il Signed-off-by: Yevgeny Petrilin yevge...@mellanox.co.il --- drivers/net/mlx4/cmd.c | 12 ++- drivers/net/mlx4/eq.c | 92 +++--- drivers/net/mlx4/mlx4.h |8 3 files changed, 105 insertions(+), 7 deletions(-) diff --git a/drivers/net/mlx4/cmd.c b/drivers/net/mlx4/cmd.c index 6637d5a..ce467ca 100644 --- a/drivers/net/mlx4/cmd.c +++ b/drivers/net/mlx4/cmd.c @@ -622,6 +622,14 @@ static struct mlx4_cmd_info { .verify = NULL, .wrapper = mlx4_RESOURCE_wrapper }, + { + .opcode = MLX4_CMD_GET_EVENT, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = true, + .verify = NULL, + .wrapper = mlx4_GET_EVENT_wrapper + }, { .opcode = MLX4_CMD_REPLACE_RES, @@ -1168,8 +1176,10 @@ int mlx4_multi_func_init(struct mlx4_dev *dev) if (!priv-mfunc.master.slave_state) goto err_comm; - for (i = 0; i dev-num_slaves; ++i) + for (i = 0; i dev-num_slaves; ++i) { priv-mfunc.master.slave_state[i].last_cmd = MLX4_COMM_CMD_RESET; + spin_lock_init(priv-mfunc.master.slave_state[i].lock); + } INIT_DELAYED_WORK(priv-mfunc.comm_work, mlx4_master_poll_comm); priv-mfunc.comm_wq = create_singlethread_workqueue(mlx4_comm); diff --git a/drivers/net/mlx4/eq.c b/drivers/net/mlx4/eq.c index 3c1aa18..e0cba6f 100644 --- a/drivers/net/mlx4/eq.c +++ b/drivers/net/mlx4/eq.c @@ -161,6 +161,61 @@ static struct mlx4_eqe *next_eqe_sw(struct mlx4_eq *eq) return !!(eqe-owner 0x80) ^ !!(eq-cons_index eq-nent) ? 
NULL : eqe; } +void mlx4_slave_event(struct mlx4_dev *dev, int slave, u8 type, u8 port, u32 param) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *ctx = priv-mfunc.master.slave_state[slave]; + unsigned long flags; + + if (ctx-last_cmd != MLX4_COMM_CMD_VHCR_POST) { + mlx4_warn(dev, received event for inactive slave:%d\n, slave); + return; + } + + /* Unconditionally add the new event - during overflows, we drop the +* oldest events */ + spin_lock_irqsave(ctx-lock, flags); + ctx-eq[ctx-eq_pi MLX4_MFUNC_EQE_MASK].type = type; + ctx-eq[ctx-eq_pi MLX4_MFUNC_EQE_MASK].port = port; + ctx-eq[ctx-eq_pi MLX4_MFUNC_EQE_MASK].param = param; + ++ctx-eq_pi; + spin_unlock_irqrestore(ctx-lock, flags); +} + +static void mlx4_slave_event_all(struct mlx4_dev *dev, u8 type, u8 port, u32 param) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int i; + + for (i = 0; i dev-num_slaves; ++i) + if (priv-mfunc.master.slave_state[i].last_cmd == MLX4_COMM_CMD_VHCR_POST) + mlx4_slave_event(dev, i, type, port, param); +} + +int mlx4_GET_EVENT_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, +struct mlx4_cmd_mailbox *inbox, +struct mlx4_cmd_mailbox *outbox) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *ctx = priv-mfunc.master.slave_state[slave]; + unsigned long flags; + + spin_lock_irqsave(ctx-lock, flags); + if (ctx-eq_ci == ctx-eq_pi) { + vhcr-out_param = MLX4_EVENT_TYPE_NONE; + } else if ((u16) (ctx-eq_pi - ctx-eq_ci) MLX4_MFUNC_MAX_EQES) { + ctx-eq_ci = ctx-eq_pi - MLX4_MFUNC_MAX_EQES; + vhcr-out_param = MLX4_EVENT_TYPE_EQ_OVERFLOW; + } else { + vhcr-out_param = ctx-eq[ctx-eq_ci MLX4_MFUNC_EQE_MASK].type | + ((u64) ctx-eq[ctx-eq_ci MLX4_MFUNC_EQE_MASK].port 8) | + ((u64) ctx-eq[ctx-eq_ci MLX4_MFUNC_EQE_MASK].param 32); + ++ctx-eq_ci; + } + spin_unlock_irqrestore(ctx-lock, flags); + return 0; +} + static int mlx4_GET_EVENT(struct mlx4_dev *dev, struct mlx4_slave_eqe *eqe) { int ret; @@ -206,14 +261,26 @@ static int 
mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) case MLX4_EVENT_TYPE_PATH_MIG_FAILED: case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR: case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR: - mlx4_qp_event(dev, be32_to_cpu(eqe-event.qp.qpn) 0xff, - eqe-type); + if (mlx4_is_master(dev)) { + /* TODO: forward only to slave owning the QP
[PATCH 08/19 V4] mlx4_core: multi-function resource setup
Only master function needs to configure eq asynch events, and initialize resource allocators. Only the master function polls for fatal events. Signed-off-by: Liran Liss lir...@mellanox.co.il Signed-off-by: Yevgeny Petrilin yevge...@mellanox.co.il --- drivers/net/mlx4/cq.c |4 +++ drivers/net/mlx4/eq.c | 52 +- drivers/net/mlx4/intf.c |6 +++- drivers/net/mlx4/main.c | 40 ++--- drivers/net/mlx4/mcg.c |6 + drivers/net/mlx4/mr.c |6 + drivers/net/mlx4/pd.c |7 - drivers/net/mlx4/qp.c | 28 +- drivers/net/mlx4/srq.c |4 +++ include/linux/mlx4/device.h |1 + 10 files changed, 109 insertions(+), 45 deletions(-) diff --git a/drivers/net/mlx4/cq.c b/drivers/net/mlx4/cq.c index c896f71..89af831 100644 --- a/drivers/net/mlx4/cq.c +++ b/drivers/net/mlx4/cq.c @@ -352,6 +352,8 @@ int mlx4_init_cq_table(struct mlx4_dev *dev) spin_lock_init(cq_table-lock); INIT_RADIX_TREE(cq_table-tree, GFP_ATOMIC); + if (mlx4_is_slave(dev)) + return 0; err = mlx4_bitmap_init(cq_table-bitmap, dev-caps.num_cqs, dev-caps.num_cqs - 1, dev-caps.reserved_cqs, 0); @@ -363,6 +365,8 @@ int mlx4_init_cq_table(struct mlx4_dev *dev) void mlx4_cleanup_cq_table(struct mlx4_dev *dev) { + if (mlx4_is_slave(dev)) + return; /* Nothing to do to clean up radix_tree */ mlx4_bitmap_cleanup(mlx4_priv(dev)-cq_table.bitmap); } diff --git a/drivers/net/mlx4/eq.c b/drivers/net/mlx4/eq.c index e0cba6f..1cb692d 100644 --- a/drivers/net/mlx4/eq.c +++ b/drivers/net/mlx4/eq.c @@ -699,6 +699,7 @@ int mlx4_init_eq_table(struct mlx4_dev *dev) struct mlx4_priv *priv = mlx4_priv(dev); int err; int i; + u32 round_eqs = roundup_pow_of_two(dev-caps.num_eqs); priv-eq_table.uar_map = kcalloc(sizeof *priv-eq_table.uar_map, mlx4_num_eq_uar(dev), GFP_KERNEL); @@ -707,33 +708,38 @@ int mlx4_init_eq_table(struct mlx4_dev *dev) goto err_out_free; } - err = mlx4_bitmap_init(priv-eq_table.bitmap, dev-caps.num_eqs, - dev-caps.num_eqs - 1, dev-caps.reserved_eqs, 0); + err = mlx4_bitmap_init(priv-eq_table.bitmap, round_eqs, round_eqs - 1, + 
dev-caps.reserved_eqs, round_eqs - dev-caps.num_eqs); if (err) goto err_out_free; for (i = 0; i mlx4_num_eq_uar(dev); ++i) priv-eq_table.uar_map[i] = NULL; - err = mlx4_map_clr_int(dev); - if (err) - goto err_out_bitmap; + if (!mlx4_is_slave(dev)) { + err = mlx4_map_clr_int(dev); + if (err) + goto err_out_bitmap; - priv-eq_table.clr_mask = - swab32(1 (priv-eq_table.inta_pin 31)); - priv-eq_table.clr_int = priv-clr_base + - (priv-eq_table.inta_pin 32 ? 4 : 0); + priv-eq_table.clr_mask = + swab32(1 (priv-eq_table.inta_pin 31)); + priv-eq_table.clr_int = priv-clr_base + + (priv-eq_table.inta_pin 32 ? 4 : 0); + } priv-eq_table.irq_names = kmalloc(MLX4_IRQNAME_SIZE * (dev-caps.num_comp_vectors + 1), GFP_KERNEL); if (!priv-eq_table.irq_names) { err = -ENOMEM; - goto err_out_bitmap; + i = 0; + goto err_out_unmap; } for (i = 0; i dev-caps.num_comp_vectors; ++i) { - err = mlx4_create_eq(dev, dev-caps.num_cqs + MLX4_NUM_SPARE_EQE, + err = mlx4_create_eq(dev, dev-caps.num_cqs - + dev-caps.reserved_cqs + + MLX4_NUM_SPARE_EQE, (dev-flags MLX4_FLAG_MSI_X) ? i : 0, priv-eq_table.eq[i]); if (err) { @@ -791,11 +797,13 @@ int mlx4_init_eq_table(struct mlx4_dev *dev) priv-eq_table.have_irq = 1; } - err = mlx4_MAP_EQ(dev, MLX4_ASYNC_EVENT_MASK, 0, - priv-eq_table.eq[dev-caps.num_comp_vectors].eqn); - if (err) - mlx4_warn(dev, MAP_EQ for async EQ %d failed (%d)\n, - priv-eq_table.eq[dev-caps.num_comp_vectors].eqn, err); + if (!mlx4_is_slave(dev)) { /* hw async events cannot be shared */ + err = mlx4_MAP_EQ(dev, MLX4_ASYNC_EVENT_MASK, 0, + priv-eq_table.eq[dev-caps.num_comp_vectors].eqn); + if (err) + mlx4_warn(dev, MAP_EQ for async EQ %d failed (%d)\n, +
[PATCH 09/19 V4] mlx4_core: boot sriov
Add virtual function device ids. Setting VF flag to device private data Establish comm channel when sriov is enabled, and boot vfs through it. Each slave gets one msi-X for completions, the master also gets one for async events. Signed-off-by: Liran Liss lir...@mellanox.co.il Signed-off-by: Yevgeny Petrilin yevge...@mellanox.co.il --- drivers/net/mlx4/eq.c |5 +- drivers/net/mlx4/main.c | 307 +++--- drivers/net/mlx4/mlx4.h |4 + 3 files changed, 240 insertions(+), 76 deletions(-) diff --git a/drivers/net/mlx4/eq.c b/drivers/net/mlx4/eq.c index 1cb692d..9126c8e 100644 --- a/drivers/net/mlx4/eq.c +++ b/drivers/net/mlx4/eq.c @@ -805,13 +805,14 @@ int mlx4_init_eq_table(struct mlx4_dev *dev) priv-eq_table.eq[dev-caps.num_comp_vectors].eqn, err); } - for (i = 0; i dev-caps.num_comp_vectors + 1; ++i) + for (i = 0; i dev-caps.num_comp_vectors + !(mlx4_is_slave(dev)); ++i) eq_set_ci(priv-eq_table.eq[i], 1); return 0; err_out_async: - mlx4_free_eq(dev, priv-eq_table.eq[dev-caps.num_comp_vectors]); + if (!mlx4_is_slave(dev)) + mlx4_free_eq(dev, priv-eq_table.eq[dev-caps.num_comp_vectors]); err_out_comp: i = dev-caps.num_comp_vectors; diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index f67f992..3331c33 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -74,6 +74,23 @@ MODULE_PARM_DESC(msi_x, attempt to use MSI-X if nonzero); #endif /* CONFIG_PCI_MSI */ +#ifdef CONFIG_PCI_IOV + +static int sr_iov; +module_param(sr_iov, int, 0444); +MODULE_PARM_DESC(sr_iov, enable #sr_iov functions if sr_iov 0); + +static int probe_vf; +module_param(probe_vf, int, 0444); +MODULE_PARM_DESC(probe_vf, number of vfs to probe by pf driver (sr_iov 0)); + +#else /* CONFIG_PCI_IOV */ + +#define sr_iov 0 +#define probe_vf 0 + +#endif /* CONFIG_PCI_IOV */ + static char mlx4_version[] __devinitdata = DRV_NAME : Mellanox ConnectX core driver v DRV_VERSION ( DRV_RELDATE )\n; @@ -780,12 +797,56 @@ static void mlx4_free_icms(struct mlx4_dev *dev) mlx4_free_icm(dev, 
priv-fw.aux_icm, 0); } +static void mlx4_slave_exit(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + down(priv-cmd.poll_sem); + if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, MLX4_COMM_TIME)) + mlx4_warn(dev, Failed to close slave function.\n); + up(priv-cmd.poll_sem); +} + static void mlx4_close_hca(struct mlx4_dev *dev) { - mlx4_CLOSE_HCA(dev, 0); - mlx4_free_icms(dev); - mlx4_UNMAP_FA(dev); - mlx4_free_icm(dev, mlx4_priv(dev)-fw.fw_icm, 0); + if (mlx4_is_slave(dev)) + mlx4_slave_exit(dev); + else { + mlx4_CLOSE_HCA(dev, 0); + mlx4_free_icms(dev); + mlx4_UNMAP_FA(dev); + mlx4_free_icm(dev, mlx4_priv(dev)-fw.fw_icm, 0); + } +} + +static int mlx4_init_slave(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + u64 dma = (u64) priv-mfunc.vhcr_dma; + + down(priv-cmd.poll_sem); + mlx4_warn(dev, Sending reset\n); + if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, MLX4_COMM_TIME)) + goto err; + mlx4_warn(dev, Sending vhcr0\n); + if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR0, dma 48, + MLX4_COMM_TIME)) + goto err; + if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR1, dma 32, + MLX4_COMM_TIME)) + goto err; + if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR2, dma 16, + MLX4_COMM_TIME)) + goto err; + if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR_EN, dma, MLX4_COMM_TIME)) + goto err; + up(priv-cmd.poll_sem); + return 0; + +err: + mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, 0); + up(priv-cmd.poll_sem); + return -EIO; } static int mlx4_init_hca(struct mlx4_dev *dev) @@ -799,51 +860,65 @@ static int mlx4_init_hca(struct mlx4_dev *dev) u64 icm_size; int err; - err = mlx4_QUERY_FW(dev); - if (err) { - if (err == -EACCES) - mlx4_info(dev, non-primary physical function, skipping.\n); - else - mlx4_err(dev, QUERY_FW command failed, aborting.\n); - return err; - } + if (!mlx4_is_slave(dev)) { + err = mlx4_QUERY_FW(dev); + if (err) { + if (err == -EACCES) + mlx4_info(dev, non-primary physical function, skipping.\n); + else + mlx4_err(dev, QUERY_FW command failed, 
aborting.\n); + return err; +
[PATCH 10/19 V4] mlx4_core: Determine primary physical function
In multifunctional devices, only the primary function would succeed to execute QUERY_FW command, all other would fail with _EACCES error. Ownership on the device can also be claimed by reading a descriptor before sw reset. If it is 0, no one claimed ownership on the device so far, otherwise, you are not the owner. A physical function that is not primary would behave as slave. Signed-off-by: Yevgeny Petrilin yevge...@mellanox.co.il --- drivers/net/mlx4/fw.c |4 drivers/net/mlx4/fw.h |1 + drivers/net/mlx4/main.c | 32 ++-- drivers/net/mlx4/mlx4.h |2 ++ drivers/net/mlx4/reset.c| 33 + include/linux/mlx4/device.h |2 ++ 6 files changed, 72 insertions(+), 2 deletions(-) diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c index dc0570f..d1427e5 100644 --- a/drivers/net/mlx4/fw.c +++ b/drivers/net/mlx4/fw.c @@ -309,6 +309,10 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev_cap-max_rdma_global = 1 (field 0x3f); MLX4_GET(field, outbox, QUERY_DEV_CAP_ACK_DELAY_OFFSET); dev_cap-local_ca_ack_delay = field 0x1f; + MLX4_GET(field, outbox, QUERY_DEV_CAP_MTU_WIDTH_OFFSET); + dev_cap-pf_num = field; + if (dev_cap-pf_num 1) + dev-flags |= MLX4_FLAG_MASTER; MLX4_GET(field, outbox, QUERY_DEV_CAP_VL_PORT_OFFSET); dev_cap-num_ports = field 0xf; MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MSG_SZ_OFFSET); diff --git a/drivers/net/mlx4/fw.h b/drivers/net/mlx4/fw.h index d066c69..a9d7e55 100644 --- a/drivers/net/mlx4/fw.h +++ b/drivers/net/mlx4/fw.h @@ -64,6 +64,7 @@ struct mlx4_dev_cap { int max_responder_per_qp; int max_rdma_global; int local_ca_ack_delay; + int pf_num; int num_ports; u32 max_msg_sz; int ib_mtu[MLX4_MAX_PORTS + 1]; diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index 3331c33..9dca6f4 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -191,6 +191,7 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) return -ENODEV; } + dev-caps.pf_num = dev_cap-pf_num; dev-caps.num_ports = 
dev_cap-num_ports; for (i = 1; i = dev-caps.num_ports; ++i) { dev-caps.vl_cap[i] = dev_cap-max_vl[i]; @@ -1296,6 +1297,19 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) /* We reset the device and enable SRIOV only for physical devices */ if (!mlx4_is_slave(dev)) { + /* Claim ownership on the device, +* if already taken, act as slave*/ + err = mlx4_get_ownership(dev); + if (err) { + if (err 0) { + goto err_free_dev; + } else { + err = 0; + dev-flags |= MLX4_FLAG_SLAVE; + goto slave_start; + } + } + /* * Now reset the HCA before we touch the PCI capabilities or * attempt a firmware command, since a boot ROM may have left @@ -1317,6 +1331,7 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) } } +slave_start: if (mlx4_cmd_init(dev)) { mlx4_err(dev, Failed to init command interface, aborting.\n); goto err_sriov; @@ -1332,8 +1347,17 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) } err = mlx4_init_hca(dev); - if (err) - goto err_cmd; + if (err) { + if (err == -EACCES) { + /* Not primary Physical function +* Running in slave mode */ + mlx4_cmd_cleanup(dev); + dev-flags |= MLX4_FLAG_SLAVE; + dev-flags = ~MLX4_FLAG_MASTER; + goto slave_start; + } else + goto err_cmd; + } /* In master functions, the communication channel must be initialized after obtaining * its address from fw */ @@ -1422,6 +1446,8 @@ err_sriov: pci_disable_sriov(pdev); err_free_dev: + if (!mlx4_is_slave(dev)) + mlx4_free_ownership(dev); kfree(priv); err_release_regions: @@ -1490,6 +1516,8 @@ static void mlx4_remove_one(struct pci_dev *pdev) pci_disable_sriov(pdev); } + if (!mlx4_is_slave(dev)) + mlx4_free_ownership(dev); kfree(priv); pci_release_regions(pdev); pci_disable_device(pdev); diff --git a/drivers/net/mlx4/mlx4.h
[PATCH 11/19 V4] mlx4_core: Activating ports according to function number
In devices with multiple physical functions, each function activates only one port, according to the function number. Even functions activate port 1, odd functions activate port2. For every virtual function we query the FW to which physical function it belongs, as all the functions are served by the master function. Signed-off-by: Yevgeny Petrilin yevge...@mellanox.co.il --- drivers/net/mlx4/cmd.c |8 drivers/net/mlx4/fw.c | 36 ++-- drivers/net/mlx4/fw.h |1 + drivers/net/mlx4/main.c | 19 --- drivers/net/mlx4/mlx4.h |3 +++ drivers/net/mlx4/port.c |9 + include/linux/mlx4/cmd.h|1 + include/linux/mlx4/device.h |6 +++--- 8 files changed, 67 insertions(+), 16 deletions(-) diff --git a/drivers/net/mlx4/cmd.c b/drivers/net/mlx4/cmd.c index a4722e2..b25e40e 100644 --- a/drivers/net/mlx4/cmd.c +++ b/drivers/net/mlx4/cmd.c @@ -423,9 +423,11 @@ static int mlx4_RESOURCE_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vh struct mlx4_cmd_mailbox *inbox, struct mlx4_cmd_mailbox *outbox) { + struct mlx4_priv *priv = mlx4_priv(dev); u32 param1 = *((u32 *) vhcr-in_param); u32 param2 = *(((u32 *) vhcr-in_param) + 1); int ret; + u8 pf_num = priv-mfunc.master.slave_state[slave].pf_num; #if 0 char *res[] = {QP, CQ, SRQ, MPT, MTT}; @@ -508,6 +510,7 @@ static int mlx4_RESOURCE_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vh mlx4_free_mtt_range(dev, param1 /* first */, param2 /* order */); break; case RES_MAC: + vhcr-in_param |= (u64) (pf_num) 48; switch (vhcr-op) { case MLX4_CMD_ALLOC_RES: ret = mlx4_register_mac(dev, vhcr-op_modifier, @@ -1096,6 +1099,11 @@ static void mlx4_master_do_cmd(struct mlx4_dev *dev, int slave, u8 cmd, u16 para if (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR2) goto reset_slave; slave_state[slave].vhcr_dma |= param; + if (mlx4_QUERY_FUNC(dev, slave, slave_state[slave].pf_num)) { + mlx4_err(dev, Failed to determine physical function + number for slave %d\n, slave); + goto reset_slave; + } break; case MLX4_COMM_CMD_VHCR_POST: if 
((slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR_EN) diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c index d1427e5..55377c0 100644 --- a/drivers/net/mlx4/fw.c +++ b/drivers/net/mlx4/fw.c @@ -152,9 +152,13 @@ int mlx4_QUERY_SLAVE_CAP_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vh memcpy(caps, dev-caps, sizeof *caps); + /* Ports are activated according to physical function number */ + mlx4_set_port_mask(dev, caps, slave); + /* PDs have the same range in every guest; the distinction is in the msbs, * which contains the guest ID (vf + 1) */ caps-pd_base = slave + 1; + caps-function = slave; /* All other resources are allocated by the master, but we still report * 'num' and 'reserved' capabilities as follows: @@ -596,6 +600,7 @@ int mlx4_QUERY_FW(struct mlx4_dev *dev) #define QUERY_FW_OUT_SIZE 0x100 #define QUERY_FW_VER_OFFSET0x00 +#define QUERY_FW_PPF_ID 0x09 #define QUERY_FW_CMD_IF_REV_OFFSET 0x0a #define QUERY_FW_MAX_CMD_OFFSET0x0f #define QUERY_FW_ERR_START_OFFSET 0x30 @@ -628,6 +633,9 @@ int mlx4_QUERY_FW(struct mlx4_dev *dev) ((fw_ver 0xull) 16) | ((fw_ver 0xull) 16); + MLX4_GET(lg, outbox, QUERY_FW_PPF_ID); + dev-caps.function = lg; + MLX4_GET(cmd_if_rev, outbox, QUERY_FW_CMD_IF_REV_OFFSET); if (cmd_if_rev MLX4_COMMAND_INTERFACE_MIN_REV || cmd_if_rev MLX4_COMMAND_INTERFACE_MAX_REV) { @@ -938,7 +946,8 @@ int mlx4_INIT_PORT(struct mlx4_dev *dev, int port) mlx4_free_cmd_mailbox(dev, mailbox); } else { if (mlx4_is_master(dev)) - err = mlx4_common_init_port(dev, 0, port); + err = mlx4_common_init_port(dev, dev-caps.function, + port); else err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_INIT_PORT, MLX4_CMD_TIME_CLASS_A); @@ -978,7 +987,7 @@ int mlx4_CLOSE_PORT_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *v int mlx4_CLOSE_PORT(struct mlx4_dev *dev, int port) { if (mlx4_is_master(dev)) - return mlx4_common_close_port(dev, 0, port); + return
[PATCH 2.6.35 1/3] RDMA/cxgb4: Don't call abort_connection() for active connect failures.
Signed-off-by: Steve Wise sw...@opengridcomputing.com --- drivers/infiniband/hw/cxgb4/cm.c |3 ++- 1 files changed, 2 insertions(+), 1 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 30ce0a8..3e15a07 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -969,7 +969,8 @@ static void process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) goto err; goto out; err: - abort_connection(ep, skb, GFP_KERNEL); + state_set(&ep->com, ABORTING); + send_abort(ep, skb, GFP_KERNEL); out: connect_reply_upcall(ep, err); return; -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2.6.35 2/3] RDMA/cxgb4: Support variable sized work requests.
T4 EQ entries are in multiples of 64B. Currently the RDMA SQ and RQ use fixed sized entries composed of 4 EQ entries for the SQ and 2 EQ entries for the RQ. For optimial latency with small IO, we need to change this so the HW only needs to DMA the EQ entries actually used by a given work request. Implementation: - add wq_pidx counter to track where we are in the EQ. cidx/pidx are used for the sw sq/rq tracking and flow control. - the variable part of work requests is the SGL. Add new functions to build the SGL and/or immediate data directly in the EQ memory wrapping when needed. - adjust the min burst size for the EQ contexts to 64B. Signed-off-by: Steve Wise sw...@opengridcomputing.com --- drivers/infiniband/hw/cxgb4/qp.c | 220 -- drivers/infiniband/hw/cxgb4/t4.h | 32 +++--- 2 files changed, 130 insertions(+), 122 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index 0c28ed1..7d87fe5 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -162,7 +162,7 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, res-u.sqrq.dcaen_to_eqsize = cpu_to_be32( V_FW_RI_RES_WR_DCAEN(0) | V_FW_RI_RES_WR_DCACPU(0) | - V_FW_RI_RES_WR_FBMIN(3) | + V_FW_RI_RES_WR_FBMIN(2) | V_FW_RI_RES_WR_FBMAX(3) | V_FW_RI_RES_WR_CIDXFTHRESHO(0) | V_FW_RI_RES_WR_CIDXFTHRESH(0) | @@ -185,7 +185,7 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, res-u.sqrq.dcaen_to_eqsize = cpu_to_be32( V_FW_RI_RES_WR_DCAEN(0) | V_FW_RI_RES_WR_DCACPU(0) | - V_FW_RI_RES_WR_FBMIN(3) | + V_FW_RI_RES_WR_FBMIN(2) | V_FW_RI_RES_WR_FBMAX(3) | V_FW_RI_RES_WR_CIDXFTHRESHO(0) | V_FW_RI_RES_WR_CIDXFTHRESH(0) | @@ -235,12 +235,78 @@ err1: return -ENOMEM; } -static int build_rdma_send(union t4_wr *wqe, struct ib_send_wr *wr, u8 *len16) +static int build_immd(struct t4_sq *sq, struct fw_ri_immd *immdp, + struct ib_send_wr *wr, int max, u32 *plenp) { + u8 *dstp, *srcp; + u32 plen = 0; int i; + int rem, len; + + dstp = (u8 
*)immdp-data; + for (i = 0; i wr-num_sge; i++) { + if ((plen + wr-sg_list[i].length) max) + return -EMSGSIZE; + srcp = (u8 *)(unsigned long)wr-sg_list[i].addr; + plen += wr-sg_list[i].length; + rem = wr-sg_list[i].length; + while (rem) { + if (dstp == (u8 *)sq-queue[sq-size]) + dstp = (u8 *)sq-queue; + if (rem = (u8 *)sq-queue[sq-size] - dstp) + len = rem; + else + len = (u8 *)sq-queue[sq-size] - dstp; + memcpy(dstp, srcp, len); + dstp += len; + srcp += len; + rem -= len; + } + } + immdp-op = FW_RI_DATA_IMMD; + immdp-r1 = 0; + immdp-r2 = 0; + immdp-immdlen = cpu_to_be32(plen); + *plenp = plen; + return 0; +} + +static int build_isgl(__be64 *queue_start, __be64 *queue_end, + struct fw_ri_isgl *isglp, struct ib_sge *sg_list, + int num_sge, u32 *plenp) + +{ + int i; + u32 plen = 0; + __be64 *flitp = (__be64 *)isglp-sge; + + for (i = 0; i num_sge; i++) { + if ((plen + sg_list[i].length) plen) + return -EMSGSIZE; + plen += sg_list[i].length; + *flitp = cpu_to_be64(((u64)sg_list[i].lkey 32) | +sg_list[i].length); + if (++flitp == queue_end) + flitp = queue_start; + *flitp = cpu_to_be64(sg_list[i].addr); + if (++flitp == queue_end) + flitp = queue_start; + } + isglp-op = FW_RI_DATA_ISGL; + isglp-r1 = 0; + isglp-nsge = cpu_to_be16(num_sge); + isglp-r2 = 0; + if (plenp) + *plenp = plen; + return 0; +} + +static int build_rdma_send(struct t4_sq *sq, union t4_wr *wqe, + struct ib_send_wr *wr, u8 *len16) +{ u32 plen; int size; - u8 *datap; + int ret; if (wr-num_sge T4_MAX_SEND_SGE) return -EINVAL; @@ -267,43 +333,23 @@ static int build_rdma_send(union t4_wr *wqe, struct ib_send_wr *wr, u8 *len16) default: return -EINVAL; } + plen = 0; if (wr-num_sge) { if (wr-send_flags IB_SEND_INLINE) { - datap = (u8 *)wqe-send.u.immd_src[0].data; - for
[PATCH 2.6.35 3/3] RDMA/cxgb4: Avoid false GTS CIDX_INC overflows.
The T4 IQ hw design assumes CIDX_INC credits will be returned on a regular basis and always before the CIDX counter crosses over the PIDX counter. For RDMA CQs, however, returning CIDX_INC credits is only needed and desired when and if the CQ is armed for notification. This can lead to a GTS write returning credits that causes the HW to reject the credit update because it causes CIDX to pass PIDX. Once this happens, the CIDX/PIDX counters get out of whack and an application can miss a notification and get stuck blocked awaiting a notification. To avoid this, we allocate the HW IQ 2x times the requested size. This seems to avoid the false overflow failures. If we see more issues with this, then we'll have to add code in the poll path to return credits periodically like when the amount reaches 1/2 the queue depth). I would like to avoid this as it adds a PCI write transaction for applications that never arm the CQ (like most MPIs). Signed-off-by: Steve Wise sw...@opengridcomputing.com --- drivers/infiniband/hw/cxgb4/cq.c | 25 - 1 files changed, 20 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index 2447f52..4311501 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -764,7 +764,7 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, int entries, struct c4iw_create_cq_resp uresp; struct c4iw_ucontext *ucontext = NULL; int ret; - size_t memsize; + size_t memsize, hwentries; struct c4iw_mm_entry *mm, *mm2; PDBG(%s ib_dev %p entries %d\n, __func__, ibdev, entries); @@ -788,14 +788,29 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, int entries, * entries must be multiple of 16 for HW. */ entries = roundup(entries, 16); - memsize = entries * sizeof *chp-cq.queue; + + /* +* Make actual HW queue 2x to avoid cdix_inc overflows. +*/ + hwentries = entries * 2; + + /* +* Make HW queue at least 64 entries so GTS updates aren't too +* frequent. 
+*/ + if (hwentries 64) + hwentries = 64; + + memsize = hwentries * sizeof *chp-cq.queue; /* * memsize must be a multiple of the page size if its a user cq. */ - if (ucontext) + if (ucontext) { memsize = roundup(memsize, PAGE_SIZE); - chp-cq.size = entries; + hwentries = memsize / sizeof *chp-cq.queue; + } + chp-cq.size = hwentries; chp-cq.memsize = memsize; ret = create_cq(rhp-rdev, chp-cq, @@ -805,7 +820,7 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, int entries, chp-rhp = rhp; chp-cq.size--; /* status page */ - chp-ibcq.cqe = chp-cq.size - 1; + chp-ibcq.cqe = entries - 2; spin_lock_init(chp-lock); atomic_set(chp-refcnt, 1); init_waitqueue_head(chp-wait); -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 12/19 V4] mlx4_core: slave multicast support
Multicast table processing requires multiple related commands. To keep things simple, low-level multicast handling is done only by the master; a new virtual command is added to allow slaves to attach/detach QPs to mulitcast groups at a higher abstraction level. The multicast attachment mechanism is used both by IB and Ethernet, so we need to specify for each multicast address (whether it is gid or mac) its protocol. For ethernet addresses, their VEP number should be specified. This field is set according device capabilities. Search and hash calculation is also done according to this field. A Ethernet ports now need to register to the multicast groups, we can not longer use the default multicast queue per port because of the multiple clients per port. Signed-off-by: Liran Liss lir...@mellanox.co.il Signed-off-by: Yevgeny Petrilin yevge...@mellanox.co.il --- drivers/infiniband/hw/mlx4/main.c |6 +- drivers/net/mlx4/cmd.c|8 +++ drivers/net/mlx4/en_netdev.c | 42 - drivers/net/mlx4/en_port.c|4 +- drivers/net/mlx4/en_port.h|5 ++ drivers/net/mlx4/fw.c |4 + drivers/net/mlx4/fw.h |2 + drivers/net/mlx4/main.c |2 + drivers/net/mlx4/mcg.c| 127 +++- drivers/net/mlx4/mlx4.h |4 + drivers/net/mlx4/port.c |4 +- include/linux/mlx4/cmd.h |1 + include/linux/mlx4/device.h | 14 - 13 files changed, 196 insertions(+), 27 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 4e94e36..2c28f98 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -452,13 +452,15 @@ static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) return mlx4_multicast_attach(to_mdev(ibqp-device)-dev, to_mqp(ibqp)-mqp, gid-raw, !!(to_mqp(ibqp)-flags - MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK)); + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK), +MLX4_PROT_IB_IPV6); } static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { return mlx4_multicast_detach(to_mdev(ibqp-device)-dev, -to_mqp(ibqp)-mqp, gid-raw); 
+to_mqp(ibqp)-mqp, gid-raw, +MLX4_PROT_IB_IPV6); } static int init_node_data(struct mlx4_ib_dev *dev) diff --git a/drivers/net/mlx4/cmd.c b/drivers/net/mlx4/cmd.c index b25e40e..0f45fde 100644 --- a/drivers/net/mlx4/cmd.c +++ b/drivers/net/mlx4/cmd.c @@ -903,6 +903,14 @@ static struct mlx4_cmd_info { /* Native multicast commands are not available for guests */ { + .opcode = MLX4_CMD_MCAST_ATTACH, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .verify = NULL, + .wrapper = mlx4_MCAST_wrapper + }, + { .opcode = MLX4_CMD_DIAG_RPRT, .has_inbox = false, .has_outbox = true, diff --git a/drivers/net/mlx4/en_netdev.c b/drivers/net/mlx4/en_netdev.c index 7389fa2..5ebe135 100644 --- a/drivers/net/mlx4/en_netdev.c +++ b/drivers/net/mlx4/en_netdev.c @@ -173,6 +173,7 @@ static void mlx4_en_cache_mclist(struct net_device *dev) int mc_addrs_cnt = netdev_mc_count(dev); int i; + mlx4_en_clear_list(dev); mc_addrs = kmalloc(mc_addrs_cnt * ETH_ALEN, GFP_ATOMIC); if (!mc_addrs) { en_err(priv, failed to allocate multicast list\n); @@ -203,6 +204,7 @@ static void mlx4_en_do_set_multicast(struct work_struct *work) struct mlx4_en_dev *mdev = priv-mdev; struct net_device *dev = priv-dev; u64 mcast_addr = 0; + u8 mc_list[16] = {0}; int err; mutex_lock(mdev-state_lock); @@ -284,6 +286,14 @@ static void mlx4_en_do_set_multicast(struct work_struct *work) if (err) en_err(priv, Failed disabling multicast filter\n); + /* Detach our qp from all the multicast addresses */ + for (i = 0; i priv-mc_addrs_cnt; i++) { + memcpy(mc_list[10], priv-mc_addrs + i * ETH_ALEN, + ETH_ALEN); + mc_list[7] = (priv-port - 1) 4; + mlx4_multicast_detach(mdev-dev, priv-rss_map.indir_qp, + mc_list, MLX4_PROT_ETH); + } /* Flush mcast filter and init it with broadcast address */ mlx4_SET_MCAST_FLTR(mdev-dev, priv-port, ETH_BCAST, 1, MLX4_MCAST_CONFIG); @@ -294,6 +304,11 @@ static void
Re: InfiniBand/RDMA merge plans for 2.6.35
On 06/09/2010 11:08 AM, Roland Dreier wrote: Please also pick up the 3-patch set Least attached vector support from Yevgeny on 2010-5-13? RDS changes depend on these. It's now post -rc2, so these obviously wait for 2.6.36 at best. However, I haven't replied to these patches in detail but in general I don't like this approach of pick a random vector since it is non-deterministic and not likely to end up with an optimal result. What is the optimal way to do this, if it isn't to spread CQs evenly across all available vectors? (least attached vector != random.) I guess we'll just round-robin modulo caps.num_comp_vectors for now, but I do think this should be up to the hca, not the ulp, since the ULP has no visibility into other ulp's usage of vectors. Regards -- Andy -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: converting ib-mgmt scripts to executables
As for ibchecknet, under what use case do you see people running it? Could ibnetdiscover or iblinkinfo provide the same functionality? I'll check on how it is being used and whether the other calls would work just as well. Thanks, - Sean -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: converting ib-mgmt scripts to executables
On Thu, 10 Jun 2010 09:04:01 -0700 Hefty, Sean sean.he...@intel.com wrote: Sasha, Before we do the work, would there be any issue converting a couple of the ib-diag scripts to executables? Specifically, we'd like to have ibchecknet and ibcheckerrors functionality available on Windows. Here at LLNL we were thinking it would be best to start removing some of the scripts to reduce the confusion about what tools do. For example, there is some confusion among our users as to what ibcheckerrs, ibcheckerrors, and ibqueryerrors does? AFAICT ibcheckerrors and ibqueryerrors only differ in 3 respects. 1) the formated output is different 2) ibcheckerrors calls ibcheckerrs which ignores error counts which are below a threshold either hard coded or specified by a file. 3) ibcheckerrs defaults to use AllPortSelect which may result in a faster scan. [*] [*] I have a patch for ibqueryerrors which querys AllPortSelect first and only issues individual queries if it sees errors. However, frankly it did not seem to speed up the scan on our large clusters so I don't know if this is a big difference between ibcheckerrors and ibqueryerrors.) 2 is very easy to add to ibqueryerrors, I don't think 1 would be that hard, and 3 is basically done. As for ibchecknet, under what use case do you see people running it? Could ibnetdiscover or iblinkinfo provide the same functionality? Ira - Sean -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://*vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 13/19 V4] mlx4_core: Giving Mac addresses for slave functions.
For physical slaves, Mac address is retreived from static configuration. For virtual slaves get random Macs. Signed-off-by: Yevgeny Petrilin yevge...@mellanox.co.il --- drivers/net/mlx4/fw.c | 64 + drivers/net/mlx4/fw.h |6 2 files changed, 70 insertions(+), 0 deletions(-) diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c index e53a392..773de63 100644 --- a/drivers/net/mlx4/fw.c +++ b/drivers/net/mlx4/fw.c @@ -32,6 +32,7 @@ * SOFTWARE. */ +#include linux/etherdevice.h #include linux/mlx4/cmd.h #include linux/cache.h @@ -136,6 +137,45 @@ int mlx4_MOD_STAT_CFG(struct mlx4_dev *dev, struct mlx4_mod_stat_cfg *cfg) return err; } +int mlx4_QUERY_VEP_CFG(struct mlx4_dev *dev, u8 vep_num, + struct mlx4_vep_cfg *cfg) +{ + int err; + u32 in_mod; + u64 output; + +#define QUERY_VEP_CFG_OPMOD3 + +#define QUERY_VEP_CFG_INMOD(2 28) +#define QUERY_VEP_CFG_INMOD_VEP_OFFSET 16 + +#define QUERY_VEP_CFG_MAC_OFFSET 0x90 +#define QUERY_VEP_CFG_LINK_OFFSET 0xa0 + + + in_mod = QUERY_VEP_CFG_INMOD | (vep_num QUERY_VEP_CFG_INMOD_VEP_OFFSET); + + err = mlx4_cmd_imm(dev, 0, output, in_mod | QUERY_VEP_CFG_MAC_OFFSET, + QUERY_VEP_CFG_OPMOD, MLX4_CMD_MOD_STAT_CFG, + MLX4_CMD_TIME_CLASS_A); + if (err) { + mlx4_err(dev, Failed to retrieve mac for function %d\n, vep_num); + return err; + } + cfg-mac = output 0xULL; + + err = mlx4_cmd_imm(dev, 0, output, in_mod | QUERY_VEP_CFG_LINK_OFFSET, + QUERY_VEP_CFG_OPMOD, MLX4_CMD_MOD_STAT_CFG, + MLX4_CMD_TIME_CLASS_A); + if (err) { + mlx4_err(dev, Failed to retrieve link for function %d\n, vep_num); + return err; + } + cfg-link = (output 32) 1; + + return 0; +} + int mlx4_QUERY_PORT_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, struct mlx4_cmd_mailbox *outbox) @@ -148,10 +188,34 @@ int mlx4_QUERY_SLAVE_CAP_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vh struct mlx4_cmd_mailbox *inbox, struct mlx4_cmd_mailbox *outbox) { + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_mfunc_master_ctx 
*master = priv-mfunc.master; + struct mlx4_slave_state *slave_st = master-slave_state[slave]; struct mlx4_caps *caps = outbox-buf; + struct mlx4_vep_cfg cfg; + u8 pf_num = slave_st-pf_num; + u8 rand_mac[6]; + int i, j, err = 0; memcpy(caps, dev-caps, sizeof *caps); + /* For physical functions Mac should be defined by fw */ + if (pf_num == slave) { + err = mlx4_QUERY_VEP_CFG(dev, pf_num, cfg); + if (err) + mlx4_warn(dev, Failed to retreive mac address for vep %d\n, pf_num); + else + caps-def_mac[(pf_num 1) + 1] = cfg.mac; + } + if (pf_num != slave || err) { + for (i = 1; i = dev-caps.num_ports; ++i) { + random_ether_addr(rand_mac); + caps-def_mac[i] = 0; + for (j = 0; j ETH_ALEN; j++) + caps-def_mac[i] |= ((u64)(rand_mac[1]) 8 * j); + } + } + /* Ports are activated according to physical function number */ mlx4_set_port_mask(dev, caps, slave); diff --git a/drivers/net/mlx4/fw.h b/drivers/net/mlx4/fw.h index d5c17cf..f8d49d0 100644 --- a/drivers/net/mlx4/fw.h +++ b/drivers/net/mlx4/fw.h @@ -43,6 +43,11 @@ struct mlx4_mod_stat_cfg { u8 log_pg_sz_m; }; +struct mlx4_vep_cfg { + u64 mac; + u8 link; +}; + struct mlx4_dev_cap { int max_srq_sz; int max_qp_sz; @@ -180,6 +185,7 @@ int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm); int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev); int mlx4_NOP(struct mlx4_dev *dev); int mlx4_MOD_STAT_CFG(struct mlx4_dev *dev, struct mlx4_mod_stat_cfg *cfg); +int mlx4_QUERY_VEP_CFG(struct mlx4_dev *dev, u8 vep_num, struct mlx4_vep_cfg *cfg); int mlx4_QUERY_FUNC(struct mlx4_dev *dev, int func, u8 *pf_num); #endif /* MLX4_FW_H */ -- 1.6.0.2 -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 14/19 V4] mlx4_core: Managing common port configuration by master function
The Multicast filter configuration is done by the master, that manages the filter which is common for all the functions. The master holds a list of multicast addresses for all the slaves, and adds them to the filter. In case some slave wishes to flush the filter, only his addresses are removed. The VLAN filter is a bitwise OR of all the VLAN filters for all functions, the result is a false-positive filter. All port configuration is moved to the mlx4_core module Signed-off-by: Yevgeny Petrilin yevge...@mellanox.co.il --- drivers/net/mlx4/cmd.c | 41 +-- drivers/net/mlx4/en_port.c | 105 -- drivers/net/mlx4/en_port.h | 47 -- drivers/net/mlx4/mlx4.h | 63 + drivers/net/mlx4/mlx4_en.h |1 - drivers/net/mlx4/port.c | 326 +-- include/linux/mlx4/device.h |6 + 7 files changed, 416 insertions(+), 173 deletions(-) diff --git a/drivers/net/mlx4/cmd.c b/drivers/net/mlx4/cmd.c index 0f45fde..4cfa407 100644 --- a/drivers/net/mlx4/cmd.c +++ b/drivers/net/mlx4/cmd.c @@ -926,7 +926,7 @@ static struct mlx4_cmd_info { .has_outbox = false, .out_is_imm = false, .verify = NULL, - .wrapper = NULL /* need wrapper*/ + .wrapper = mlx4_SET_VLAN_FLTR_wrapper }, { .opcode = MLX4_CMD_SET_MCAST_FLTR, @@ -934,7 +934,7 @@ static struct mlx4_cmd_info { .has_outbox = false, .out_is_imm = false, .verify = NULL, - .wrapper = NULL /* need wrapper*/ + .wrapper = mlx4_SET_MCAST_FLTR_wrapper }, { .opcode = MLX4_CMD_DUMP_ETH_STATS, @@ -1170,7 +1170,8 @@ static void mlx4_master_poll_comm(struct work_struct *work) int mlx4_multi_func_init(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); - int i; + struct mlx4_slave_state *s_state; + int i, port; priv-mfunc.vhcr = dma_alloc_coherent((dev-pdev-dev), PAGE_SIZE, priv-mfunc.vhcr_dma, @@ -1202,16 +1203,27 @@ int mlx4_multi_func_init(struct mlx4_dev *dev) goto err_comm; for (i = 0; i dev-num_slaves; ++i) { - priv-mfunc.master.slave_state[i].last_cmd = MLX4_COMM_CMD_RESET; - spin_lock_init(priv-mfunc.master.slave_state[i].lock); + s_state = 
priv-mfunc.master.slave_state[i]; + s_state-last_cmd = MLX4_COMM_CMD_RESET; + for (port = 1; port = MLX4_MAX_PORTS; port++) { + s_state-vlan_filter[port] = + kzalloc(sizeof(struct mlx4_vlan_fltr), + GFP_KERNEL); + if (!s_state-vlan_filter[port]) { + if (--port) + kfree(s_state-vlan_filter[port]); + goto err_slaves; + } + INIT_LIST_HEAD(s_state-mcast_filters[port]); + } + spin_lock_init(s_state-lock); } INIT_DELAYED_WORK(priv-mfunc.comm_work, mlx4_master_poll_comm); priv-mfunc.comm_wq = create_singlethread_workqueue(mlx4_comm); - if (!priv-mfunc.comm_wq) { - kfree(priv-mfunc.master.slave_state); - goto err_comm; - } + if (!priv-mfunc.comm_wq) + goto err_slaves; + } else { priv-cmd.comm_toggle = 0; INIT_DELAYED_WORK(priv-mfunc.comm_work, mlx4_slave_async_eq_poll); @@ -1221,6 +1233,12 @@ int mlx4_multi_func_init(struct mlx4_dev *dev) } return 0; +err_slaves: + while (--i) { + for (port = 1; port = MLX4_MAX_PORTS; port++) + kfree(priv-mfunc.master.slave_state[i].vlan_filter[port]); + } + kfree(priv-mfunc.master.slave_state); err_comm: iounmap(priv-mfunc.comm); err_vhcr: @@ -1269,9 +1287,14 @@ err_hcr: void mlx4_multi_func_cleanup(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); + int i, port; if (priv-mfunc.vhcr) { destroy_workqueue(priv-mfunc.comm_wq); + for (i = 0; i dev-num_slaves; i++) { + for (port = 1; port = MLX4_MAX_PORTS; port++) + kfree(priv-mfunc.master.slave_state[i].vlan_filter[port]); + } kfree(priv-mfunc.master.slave_state); iounmap(priv-mfunc.comm); dma_free_coherent((dev-pdev-dev), PAGE_SIZE, diff --git a/drivers/net/mlx4/en_port.c b/drivers/net/mlx4/en_port.c index
[PATCH 16/19 V4] mlx4_en: querying link state
In multifunction device, a certain function can initialize its port after some other function already done that. In that case link event would not be generated. Need to query the port to retrieve the port state. Signed-off-by: Yevgeny Petrilin yevge...@mellanox.co.il --- drivers/net/mlx4/en_netdev.c | 10 ++ drivers/net/mlx4/en_port.c | 32 drivers/net/mlx4/en_port.h | 13 + drivers/net/mlx4/mlx4_en.h |8 4 files changed, 63 insertions(+), 0 deletions(-) diff --git a/drivers/net/mlx4/en_netdev.c b/drivers/net/mlx4/en_netdev.c index 5ebe135..d171945 100644 --- a/drivers/net/mlx4/en_netdev.c +++ b/drivers/net/mlx4/en_netdev.c @@ -219,6 +219,16 @@ static void mlx4_en_do_set_multicast(struct work_struct *work) goto out; } + if (!netif_carrier_ok(dev)) { + if (!mlx4_en_QUERY_PORT(mdev, priv-port)) { + if (priv-port_state.link_state) { + priv-last_link_state = MLX4_DEV_EVENT_PORT_UP; + netif_carrier_on(dev); + en_dbg(LINK, priv, Link Up\n); + } + } + } + /* * Promsicuous mode: disable all filters */ diff --git a/drivers/net/mlx4/en_port.c b/drivers/net/mlx4/en_port.c index 2863a30..84cc32d 100644 --- a/drivers/net/mlx4/en_port.c +++ b/drivers/net/mlx4/en_port.c @@ -40,6 +40,38 @@ #include en_port.h #include mlx4_en.h +int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port) +{ + struct mlx4_en_query_port_context *qport_context; + struct mlx4_en_priv *priv = netdev_priv(mdev-pndev[port]); + struct mlx4_en_port_state *state = priv-port_state; + struct mlx4_cmd_mailbox *mailbox; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(mdev-dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + memset(mailbox-buf, 0, sizeof(*qport_context)); + err = mlx4_cmd_box(mdev-dev, 0, mailbox-dma, port, 0, + MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B); + if (err) + goto out; + qport_context = mailbox-buf; + + /* This command is always accessed from Ethtool context +* already synchronized, no need in locking */ + state-link_state = !!(qport_context-link_up MLX4_EN_LINK_UP_MASK); + if 
((qport_context-link_speed MLX4_EN_SPEED_MASK) == + MLX4_EN_1G_SPEED) + state-link_speed = 1000; + else + state-link_speed = 1; + state-transciver = qport_context-transceiver; + +out: + mlx4_free_cmd_mailbox(mdev-dev, mailbox); + return err; +} int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset) { diff --git a/drivers/net/mlx4/en_port.h b/drivers/net/mlx4/en_port.h index 40918ab..ecbab85 100644 --- a/drivers/net/mlx4/en_port.h +++ b/drivers/net/mlx4/en_port.h @@ -41,6 +41,19 @@ enum { MLX4_MCAST_ENABLE = 2, }; +struct mlx4_en_query_port_context { + u8 link_up; +#define MLX4_EN_LINK_UP_MASK 0x80 + u8 reserved; + __be16 mtu; + u8 reserved2; + u8 link_speed; +#define MLX4_EN_SPEED_MASK 0x3 +#define MLX4_EN_1G_SPEED 0x2 + u16 reserved3[5]; + __be64 mac; + u8 transceiver; +}; struct mlx4_en_stat_out_mbox { /* Received frames with a length of 64 octets */ diff --git a/drivers/net/mlx4/mlx4_en.h b/drivers/net/mlx4/mlx4_en.h index 2204ec3..e0ce6c5 100644 --- a/drivers/net/mlx4/mlx4_en.h +++ b/drivers/net/mlx4/mlx4_en.h @@ -389,6 +389,12 @@ struct mlx4_en_rss_context { __be32 rss_key[10]; }; +struct mlx4_en_port_state { + int link_state; + int link_speed; + int transciver; +}; + struct mlx4_en_pkt_stats { unsigned long broadcast; unsigned long rx_prio[8]; @@ -437,6 +443,7 @@ struct mlx4_en_priv { struct vlan_group *vlgrp; struct net_device_stats stats; struct net_device_stats ret_stats; + struct mlx4_en_port_state port_state; spinlock_t stats_lock; unsigned long last_moder_packets; @@ -562,6 +569,7 @@ int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn, u8 promisc); int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset); +int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port); /* * Globals -- 1.6.0.2 -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 18/19 V4] mlx4_core: setting MGM entry size to 512
Both unicast and multicast addresses use the same table Signed-off-by: Yevgeny Petrilin yevge...@mellanox.co.il --- drivers/net/mlx4/mlx4.h |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h index 8530032..7a7f787 100644 --- a/drivers/net/mlx4/mlx4.h +++ b/drivers/net/mlx4/mlx4.h @@ -64,7 +64,7 @@ enum { }; enum { - MLX4_MGM_ENTRY_SIZE = 0x100, + MLX4_MGM_ENTRY_SIZE = 0x200, MLX4_QP_PER_MGM = 4 * (MLX4_MGM_ENTRY_SIZE / 16 - 2), MLX4_MTT_ENTRY_PER_SEG = 8 }; -- 1.6.0.2 -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 19/19 V4] mlx4: Communication channel interrupts
The master function receives an interrupt each time one of the slaves writes to the communication channel, and then handles the channel from a deferred task. The slaves can now receive command completions and async events by interrupts. They open an EQ and the master generates eqes and pushes them to the slave's EQ using the Firmware GEN_EQE command. Signed-off-by: Yevgeny Petrilin yevge...@mellanox.co.il --- drivers/net/mlx4/cmd.c | 258 +++--- drivers/net/mlx4/eq.c | 296 +++ drivers/net/mlx4/main.c | 50 +++- drivers/net/mlx4/mlx4.h | 97 +-- include/linux/mlx4/cmd.h|7 +- include/linux/mlx4/device.h |1 + 6 files changed, 459 insertions(+), 250 deletions(-) diff --git a/drivers/net/mlx4/cmd.c b/drivers/net/mlx4/cmd.c index 7efa85f..083ae0f 100644 --- a/drivers/net/mlx4/cmd.c +++ b/drivers/net/mlx4/cmd.c @@ -148,19 +148,11 @@ static int comm_pending(struct mlx4_dev *dev) return (swab32(status) 30) != priv-cmd.comm_toggle; } -int mlx4_comm_cmd(struct mlx4_dev *dev, u8 cmd, u16 param, unsigned long timeout) +static void mlx4_comm_cmd_post(struct mlx4_dev *dev, u8 cmd, u16 param) { struct mlx4_priv *priv = mlx4_priv(dev); - unsigned long end; u32 val; - /* First, verify that the master reports correct status */ - if (comm_pending(dev)) { - mlx4_warn(dev, Communication channel is not idle\n); - return -EAGAIN; - } - - /* Write command */ if (cmd == MLX4_COMM_CMD_RESET) priv-cmd.comm_toggle = 0; else if (++priv-cmd.comm_toggle 2) @@ -168,6 +160,23 @@ int mlx4_comm_cmd(struct mlx4_dev *dev, u8 cmd, u16 param, unsigned long timeout val = param | (cmd 16) | (priv-cmd.comm_toggle 30); __raw_writel((__force u32) cpu_to_be32(val), priv-mfunc.comm-slave_write); wmb(); +} + +int mlx4_comm_cmd_poll(struct mlx4_dev *dev, u8 cmd, u16 param, unsigned long timeout) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + unsigned long end; + int err = 0; + + /* First, verify that the master reports correct status */ + if (comm_pending(dev)) { + mlx4_warn(dev, Communication channel is not idle\n); +
return -EAGAIN; + } + + /* Write command */ + down(priv-cmd.poll_sem); + mlx4_comm_cmd_post(dev, cmd, param); end = msecs_to_jiffies(timeout) + jiffies; while (comm_pending(dev) time_before(jiffies, end)) @@ -175,11 +184,57 @@ int mlx4_comm_cmd(struct mlx4_dev *dev, u8 cmd, u16 param, unsigned long timeout if (comm_pending(dev)) { mlx4_warn(dev, Communication channel timed out\n); - return -ETIMEDOUT; + err = -ETIMEDOUT; } + + up(priv-cmd.poll_sem); return 0; } +static int mlx4_comm_cmd_wait(struct mlx4_dev *dev, u8 op, + u16 param, unsigned long timeout) +{ + struct mlx4_cmd *cmd = mlx4_priv(dev)-cmd; + struct mlx4_cmd_context *context; + int err = 0; + + down(cmd-event_sem); + + spin_lock(cmd-context_lock); + BUG_ON(cmd-free_head 0); + context = cmd-context[cmd-free_head]; + context-token += cmd-token_mask + 1; + cmd-free_head = context-next; + spin_unlock(cmd-context_lock); + + init_completion(context-done); + + mlx4_comm_cmd_post(dev, op, param); + + if (!wait_for_completion_timeout(context-done, msecs_to_jiffies(timeout))) { + err = -EBUSY; + goto out; + } + + err = context-result; + +out: + spin_lock(cmd-context_lock); + context-next = cmd-free_head; + cmd-free_head = context - cmd-context; + spin_unlock(cmd-context_lock); + + up(cmd-event_sem); + return err; +} + +int mlx4_comm_cmd(struct mlx4_dev *dev, u8 cmd, u16 param, unsigned long timeout) +{ + if (mlx4_priv(dev)-cmd.use_events) + return mlx4_comm_cmd_wait(dev, cmd, param, timeout); + return mlx4_comm_cmd_poll(dev, cmd, param, timeout); +} + static int cmd_pending(struct mlx4_dev *dev) { u32 status = readl(mlx4_priv(dev)-cmd.hcr + HCR_STATUS_OFFSET); @@ -247,15 +302,15 @@ out: return ret; } -static int mlx4_slave_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param, -int out_is_imm, u32 in_modifier, u8 op_modifier, -u16 op, unsigned long timeout) +static int mlx4_slave_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param, + int out_is_imm, u32 in_modifier, u8 op_modifier, + u16 op, unsigned 
long timeout) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_vhcr *vhcr = priv-mfunc.vhcr; int ret; - down(priv-cmd.poll_sem); + down(priv-cmd.slave_sem);
Re: [PATCH 2.6.35 2/3] RDMA/cxgb4: Support variable sized work requests.
T4 EQ entries are in multiples of 64B. Currently the RDMA SQ and RQ use fixed sized entries composed of 4 EQ entries for the SQ and 2 EQ entries for the RQ. For optimal latency with small IO, we need to change this so the HW only needs to DMA the EQ entries actually used by a given work request. This seems not to be a fix, just an optimization -- so at this point for 2.6.36 I think. Or am I wrong? -- Roland Dreier rola...@cisco.com || For corporate legal information go to: http://www.cisco.com/web/about/doing_business/legal/cri/index.html -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2.6.35 2/3] RDMA/cxgb4: Support variable sized work requests.
Linus has been being a hard-ass this cycle about quieting things down post -rc2. So I'll hold off. -- Roland Dreier rola...@cisco.com || For corporate legal information go to: http://www.cisco.com/web/about/doing_business/legal/cri/index.html -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 17/19 V4] mlx4: Using mcg tables for Ethernet Unicast steering.
When there are multiple interfaces on the same physical port, the old steering model (Mac steering) would not work. The reason is in the old model there could be only one promiscuous QP per port. With the new mechanism each interface can have promisc entries both for unicast and multicast. A promisc QP is registered to all entries that belong to the same port and also for the default entry. In fw where this feature is not supported, using the Mac table steering. Signed-off-by: Yevgeny Petrilin yevge...@mellanox.co.il --- drivers/net/mlx4/cmd.c | 10 +- drivers/net/mlx4/en_netdev.c | 55 - drivers/net/mlx4/fw.c|3 + drivers/net/mlx4/main.c | 67 +- drivers/net/mlx4/mcg.c | 611 ++ drivers/net/mlx4/mlx4.h | 30 ++ drivers/net/mlx4/mlx4_en.h |1 + drivers/net/mlx4/port.c | 91 ++- include/linux/mlx4/cmd.h |1 + include/linux/mlx4/device.h | 12 +- 10 files changed, 821 insertions(+), 60 deletions(-) diff --git a/drivers/net/mlx4/cmd.c b/drivers/net/mlx4/cmd.c index 660d001..7efa85f 100644 --- a/drivers/net/mlx4/cmd.c +++ b/drivers/net/mlx4/cmd.c @@ -514,7 +514,7 @@ static int mlx4_RESOURCE_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vh switch (vhcr-op) { case MLX4_CMD_ALLOC_RES: ret = mlx4_register_mac(dev, vhcr-op_modifier, - vhcr-in_param, (int *) vhcr-out_param); + vhcr-in_param, (int *) vhcr-out_param, 1); vhcr-errno = ret; break; case MLX4_CMD_FREE_RES: @@ -937,6 +937,14 @@ static struct mlx4_cmd_info { .wrapper = mlx4_MCAST_wrapper }, { + .opcode = MLX4_CMD_PROMISC, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .verify = NULL, + .wrapper = mlx4_PROMISC_wrapper + }, + { .opcode = MLX4_CMD_DIAG_RPRT, .has_inbox = false, .has_outbox = true, diff --git a/drivers/net/mlx4/en_netdev.c b/drivers/net/mlx4/en_netdev.c index d171945..6dd47e8 100644 --- a/drivers/net/mlx4/en_netdev.c +++ b/drivers/net/mlx4/en_netdev.c @@ -240,8 +240,12 @@ static void mlx4_en_do_set_multicast(struct work_struct *work) priv-flags |= MLX4_EN_FLAG_PROMISC; /* Enable 
promiscouos mode */ - err = mlx4_SET_PORT_qpn_calc(mdev-dev, priv-port, -priv-base_qpn, 1); + if (!mdev-dev-caps.vep_uc_steering) + err = mlx4_SET_PORT_qpn_calc(mdev-dev, priv-port, +priv-base_qpn, 1); + else + err = mlx4_unicast_promisc_add(mdev-dev, priv-base_qpn, + priv-port - 1); if (err) en_err(priv, Failed enabling promiscous mode\n); @@ -253,6 +257,15 @@ static void mlx4_en_do_set_multicast(struct work_struct *work) en_err(priv, Failed disabling multicast filter\n); + /* Add the default qp number as multicast promisc */ + if (!(priv-flags MLX4_EN_FLAG_MC_PROMISC)) { + err = mlx4_multicast_promisc_add(mdev-dev, priv-base_qpn, +priv-port - 1); + if (err) + en_err(priv, Failed entering multicast promisc mode\n); + priv-flags |= MLX4_EN_FLAG_MC_PROMISC; + } + /* Disable port VLAN filter */ err = mlx4_SET_VLAN_FLTR(mdev-dev, priv-port, NULL); if (err) @@ -271,11 +284,24 @@ static void mlx4_en_do_set_multicast(struct work_struct *work) priv-flags = ~MLX4_EN_FLAG_PROMISC; /* Disable promiscouos mode */ - err = mlx4_SET_PORT_qpn_calc(mdev-dev, priv-port, -priv-base_qpn, 0); + if (!mdev-dev-caps.vep_uc_steering) + err = mlx4_SET_PORT_qpn_calc(mdev-dev, priv-port, +priv-base_qpn, 0); + else + err = mlx4_unicast_promisc_remove(mdev-dev, priv-base_qpn, + priv-port - 1); if (err)
Re: [PATCH 2.6.35 2/3] RDMA/cxgb4: Support variable sized work requests.
Roland Dreier wrote: T4 EQ entries are in multiples of 64B. Currently the RDMA SQ and RQ use fixed sized entries composed of 4 EQ entries for the SQ and 2 EQ entries for the RQ. For optimial latency with small IO, we need to change this so the HW only needs to DMA the EQ entries actually used by a given work request. This seems not to be a fix, just an optimization -- so at this point for 2.6.36 I think. Or am I wrong? You are correct...I was hoping since iw_cxgb4 is new to 2.6.35, we could still get this in. But if you disagree, then 2.6.36... -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: InfiniBand/RDMA merge plans for 2.6.35
However, I haven't replied to these patches in detail but in general I don't like this approach of pick a random vector since it is non-deterministic and not likely to end up with an optimal result. What is the optimal way to do this, if it isn't to spread CQs evenly across all available vectors? (least attached vector != random.) Since there is no way to know whether a given vector has a bunch of CQs that generate very few events or maybe a single CQ that generates a heavy load of events, the number of attached CQs is really pretty useless as a basis to decide. I think it's much better to try and attach your CQ to a vector that is directed at the CPU where you want to process the work. - R. -- Roland Dreier rola...@cisco.com || For corporate legal information go to: http://www.cisco.com/web/about/doing_business/legal/cri/index.html -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 15/19 V4] mlx4_core: Adding VEP number in resource allocation
The firmware should be aware of the function a resource is opened on. The function number is passed in the lower bits of input paramater. Non multi-function firmware masks these bits. Signed-off-by: Yevgeny Petrilin yevge...@mellanox.co.il --- drivers/net/mlx4/cmd.c | 58 ++- 1 files changed, 42 insertions(+), 16 deletions(-) diff --git a/drivers/net/mlx4/cmd.c b/drivers/net/mlx4/cmd.c index 4cfa407..660d001 100644 --- a/drivers/net/mlx4/cmd.c +++ b/drivers/net/mlx4/cmd.c @@ -534,6 +534,32 @@ static int mlx4_RESOURCE_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vh return 0; } +static int mlx4_DMA_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox) +{ + u64 in_param = inbox ? inbox-dma : vhcr-in_param; + + in_param |= (u64) slave; + return mlx4_cmd(dev, in_param, vhcr-in_modifier, + vhcr-op_modifier, vhcr-op, MLX4_CMD_TIME_CLASS_C); +} + +static int mlx4_DMA_outbox_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox) +{ + u64 in_param = inbox ? inbox-dma : vhcr-in_param; + u64 out_param = outbox ? 
outbox-dma : vhcr-out_param; + + in_param |= (u64) slave; + return mlx4_cmd_box(dev, in_param, out_param, + vhcr-in_modifier, vhcr-op_modifier, vhcr-op, + MLX4_CMD_TIME_CLASS_C); +} + static struct mlx4_cmd_info { u16 opcode; bool has_inbox; @@ -608,7 +634,7 @@ static struct mlx4_cmd_info { .has_outbox = false, .out_is_imm = false, .verify = NULL, /*need verifier */ - .wrapper = NULL + .wrapper = mlx4_DMA_wrapper }, { .opcode = MLX4_CMD_NOP, @@ -657,7 +683,7 @@ static struct mlx4_cmd_info { .has_outbox = false, .out_is_imm = false, .verify = NULL, - .wrapper = NULL + .wrapper = mlx4_DMA_wrapper }, { .opcode = MLX4_CMD_QUERY_MPT, @@ -703,10 +729,10 @@ static struct mlx4_cmd_info { { .opcode = MLX4_CMD_HW2SW_EQ, .has_inbox = false, - .has_outbox = false, + .has_outbox = true, .out_is_imm = false, .verify = NULL, /* need verifier */ - .wrapper = NULL + .wrapper = mlx4_DMA_outbox_wrapper }, { .opcode = MLX4_CMD_QUERY_EQ, @@ -722,7 +748,7 @@ static struct mlx4_cmd_info { .has_outbox = false, .out_is_imm = false, .verify = NULL, /* need verifier */ - .wrapper = NULL + .wrapper = mlx4_DMA_wrapper }, { .opcode = MLX4_CMD_HW2SW_CQ, @@ -730,7 +756,7 @@ static struct mlx4_cmd_info { .has_outbox = false, .out_is_imm = false, .verify = NULL, /* need verifier */ - .wrapper = NULL + .wrapper = mlx4_DMA_wrapper }, { .opcode = MLX4_CMD_QUERY_CQ, @@ -754,7 +780,7 @@ static struct mlx4_cmd_info { .has_outbox = false, .out_is_imm = false, .verify = NULL, /* need verifier */ - .wrapper = NULL + .wrapper = mlx4_DMA_wrapper }, { .opcode = MLX4_CMD_HW2SW_SRQ, @@ -786,7 +812,7 @@ static struct mlx4_cmd_info { .has_outbox = false, .out_is_imm = false, .verify = NULL, /* need verifier */ - .wrapper = NULL + .wrapper = mlx4_DMA_wrapper }, { .opcode = MLX4_CMD_INIT2RTR_QP, @@ -794,7 +820,7 @@ static struct mlx4_cmd_info { .has_outbox = false, .out_is_imm = false, .verify = NULL, /* need verifier */ - .wrapper = NULL + .wrapper = mlx4_DMA_wrapper }, { .opcode = MLX4_CMD_RTR2RTS_QP, @@ -802,7 
+828,7 @@ static struct mlx4_cmd_info { .has_outbox = false, .out_is_imm = false, .verify = NULL, /* need verifier */ - .wrapper = NULL + .wrapper = mlx4_DMA_wrapper }, { .opcode = MLX4_CMD_RTS2RTS_QP, @@ -810,7 +836,7 @@ static struct mlx4_cmd_info { .has_outbox = false, .out_is_imm = false, .verify = NULL, /* need verifier */ - .wrapper = NULL +
[PATCH] ibqueryerrors.c: Optimize by querying AllPortSelect first
From: Ira Weiny wei...@hera2.llnl.gov Date: Thu, 6 May 2010 13:49:55 -0700 Subject: [PATCH] ibqueryerrors.c: Optimize by querying AllPortSelect first If errors are seen with AllPortSelect query individual ports for more details. Signed-off-by: Ira Weiny wei...@llnl.gov --- infiniband-diags/src/ibqueryerrors.c | 55 +++-- 1 files changed, 38 insertions(+), 17 deletions(-) diff --git a/infiniband-diags/src/ibqueryerrors.c b/infiniband-diags/src/ibqueryerrors.c index f04e47f..e0b1c0b 100644 --- a/infiniband-diags/src/ibqueryerrors.c +++ b/infiniband-diags/src/ibqueryerrors.c @@ -245,9 +245,9 @@ static int query_and_dump(char *buf, size_t size, ib_portid_t * portid, return n; } -static void print_results(ib_portid_t * portid, char *node_name, - ibnd_node_t * node, uint8_t * pc, int portnum, - int *header_printed) +static int print_results(ib_portid_t * portid, char *node_name, +ibnd_node_t * node, uint8_t * pc, int portnum, +int *header_printed) { char buf[1024]; char *str = buf; @@ -311,11 +311,16 @@ static void print_results(ib_portid_t * portid, char *node_name, *header_printed = 1; } - printf( GUID 0x% PRIx64 port %d:%s\n, node-guid, - portnum, str); - if (port_config) + if (portnum == 0xFF) + printf( GUID 0x% PRIx64 port ALL:%s\n, + node-guid, str); + else + printf( GUID 0x% PRIx64 port %d:%s\n, + node-guid, portnum, str); + if (portnum != 0xFF port_config) print_port_config(node_name, node, portnum); } + return (n); } static int query_cap_mask(ib_portid_t * portid, char *node_name, int portnum, @@ -339,8 +344,8 @@ static int query_cap_mask(ib_portid_t * portid, char *node_name, int portnum, return 0; } -static void print_port(ib_portid_t * portid, uint16_t cap_mask, char *node_name, - ibnd_node_t * node, int portnum, int *header_printed) +static int print_port(ib_portid_t * portid, uint16_t cap_mask, char *node_name, + ibnd_node_t * node, int portnum, int *header_printed) { uint8_t pc[1024]; @@ -350,14 +355,15 @@ static void print_port(ib_portid_t * portid, 
uint16_t cap_mask, char *node_name, IB_GSI_PORT_COUNTERS, ibmad_port)) { IBWARN(IB_GSI_PORT_COUNTERS query failed on %s, %s port %d, node_name, portid2str(portid), portnum); - return; + return (0); } if (!(cap_mask 0x1000)) { /* if PortCounters:PortXmitWait not supported clear this counter */ uint32_t foo = 0; mad_encode_field(pc, IB_PC_XMT_WAIT_F, foo); } - print_results(portid, node_name, node, pc, portnum, header_printed); + return (print_results(portid, node_name, node, pc, portnum, + header_printed)); } static void clear_port(ib_portid_t * portid, uint16_t cap_mask, @@ -425,6 +431,27 @@ void print_node(ibnd_node_t * node, void *user_data) node_name = remap_node_name(node_name_map, node-guid, node-nodedesc); + if (node-type == IB_NODE_SWITCH) { + ib_portid_set(portid, node-smalid, 0, 0); + p = 0; + } else { + for (p = 1; p = node-numports; p++) { + if (node-ports[p]) { + ib_portid_set(portid, + node-ports[p]-base_lid, + 0, 0); + break; + } + } + } + if ((query_cap_mask(portid, node_name, p, cap_mask) == 0) + (cap_mask 0x100)) { + all_port_sup = 1; + if (!print_port(portid, cap_mask, node_name, node, + 0xFF, header_printed)) + goto clear; + } + for (p = startport; p = node-numports; p++) { if (node-ports[p]) { if (node-type == IB_NODE_SWITCH) @@ -433,13 +460,6 @@ void print_node(ibnd_node_t * node, void *user_data) ib_portid_set(portid, node-ports[p]-base_lid, 0, 0); - if (query_cap_mask(portid, node_name, p, cap_mask) - 0) - continue; - - if (cap_mask 0x100) - all_port_sup = 1; - print_port(portid, cap_mask, node_name, node, p,