On Tue, Mar 14, 2023 at 10:46:02AM -0400, Aaron Conole wrote: > Flavio Leitner <f...@sysclose.org> writes: > > > On Wed, Mar 08, 2023 at 05:37:11PM -0500, Aaron Conole wrote: > >> Open vSwitch generally tries to let the underlying operating system > >> managed the low level details of hardware, for example DMA mapping, > >> bus arbitration, etc. However, when using DPDK, the underlying > >> operating system yields control of many of these details to userspace > >> for management. > >> > >> In the case of some DPDK port drivers, configuring rte_flow or even > >> allocating resources may require access to iopl/ioperm calls, which > >> are guarded by the CAP_SYS_RAWIO privilege on linux systems. These > >> calls are dangerous, and can allow a process to completely compromise > >> a system. However, they are needed in the case of some userspace > >> driver code which manages the hardware (for example, the mlx > >> implementation of backend support for rte_flow). > >> > >> Here, we create an opt-in flag passed to the command line to allow > >> this access. We need to do this before ever accessing the database, > >> because we want to drop all privileges asap, and cannot wait for > >> a connection to the database to be established and functional before > >> dropping. There may be distribution specific ways to do capability > >> management as well (using for example, systemd), but they are not > >> as universal to the vswitchd as a flag. > >> > >> Reviewed-by: Simon Horman <simon.hor...@corigine.com> > >> Signed-off-by: Aaron Conole <acon...@redhat.com> > >> --- > >> v1->v2: update daemon-windows for daemon_become_new_user > >> > >> v2->v3: update daemon-windows for daemon_start > >> change log messages to be clearer > >> update the manpage to provide example of why > >> one would want the flag > >> > >> NEWS | 4 ++++ > >> lib/daemon-unix.c | 29 +++++++++++++++++++++-------- > >> lib/daemon-windows.c | 6 ++++-- > >> lib/daemon.c | 2 +- > >> lib/daemon.h | 4 ++-- > >> ovsdb/ovsdb-client.c | 6 +++--- > >> ovsdb/ovsdb-server.c | 4 ++-- > >> tests/test-netflow.c | 2 +- > >> tests/test-sflow.c | 2 +- > >> tests/test-unixctl.c | 2 +- > >> utilities/ovs-ofctl.c | 4 ++-- > >> utilities/ovs-testcontroller.c | 4 ++-- > >> vswitchd/ovs-vswitchd.8.in | 9 +++++++++ > >> vswitchd/ovs-vswitchd.c | 11 ++++++++++- > >> 14 files changed, 63 insertions(+), 26 deletions(-) > >> > >> diff --git a/NEWS b/NEWS > >> index 85b3496214..65f35dcdd5 100644 > >> --- a/NEWS > >> +++ b/NEWS > >> @@ -10,6 +10,10 @@ Post-v3.1.0 > >> in order to create OVSDB sockets with access mode of 0770. > >> - QoS: > >> * Added new configuration option 'jitter' for a linux-netem QoS type. > >> + - DPDK: > >> + * ovs-vswitchd will keep the CAP_SYS_RAWIO capability when started > >> + with the --hw-rawio-access command line option. This allows the > >> + process extra privileges when mapping physical interconnect memory. > >> > >> > >> v3.1.0 - 16 Feb 2023 > >> diff --git a/lib/daemon-unix.c b/lib/daemon-unix.c > >> index 1a7ba427d7..dd839015ab 100644 > >> --- a/lib/daemon-unix.c > >> +++ b/lib/daemon-unix.c > >> @@ -88,7 +88,8 @@ static bool switch_user = false; > >> static uid_t uid; > >> static gid_t gid; > >> static char *user = NULL; > >> -static void daemon_become_new_user__(bool access_datapath); > >> +static void daemon_become_new_user__(bool access_datapath, > >> + bool access_hardware_ports); > >> > >> static void check_already_running(void); > >> static int lock_pidfile(FILE *, int command); > >> @@ -443,13 +444,13 @@ monitor_daemon(pid_t daemon_pid) > >> * daemonize_complete()) or that it failed to start up (by exiting with a > >> * nonzero exit code). */ > >> void > >> -daemonize_start(bool access_datapath) > >> +daemonize_start(bool access_datapath, bool access_hardware_ports) > >> { > >> assert_single_threaded(); > >> daemonize_fd = -1; > >> > >> if (switch_user) { > >> - daemon_become_new_user__(access_datapath); > >> + daemon_become_new_user__(access_datapath, access_hardware_ports); > >> switch_user = false; > >> } > >> > >> @@ -807,7 +808,8 @@ daemon_become_new_user_unix(void) > >> /* Linux specific implementation of daemon_become_new_user() > >> * using libcap-ng. */ > >> static void > >> -daemon_become_new_user_linux(bool access_datapath OVS_UNUSED) > >> +daemon_become_new_user_linux(bool access_datapath OVS_UNUSED, > >> + bool access_hardware_ports OVS_UNUSED) > >> { > >> #if defined __linux__ && HAVE_LIBCAPNG > >> int ret; > >> @@ -827,6 +829,16 @@ daemon_become_new_user_linux(bool access_datapath > >> OVS_UNUSED) > >> ret = capng_update(CAPNG_ADD, cap_sets, CAP_NET_ADMIN) > >> || capng_update(CAPNG_ADD, cap_sets, CAP_NET_RAW) > >> || capng_update(CAPNG_ADD, cap_sets, > >> CAP_NET_BROADCAST); > >> +#ifdef DPDK_NETDEV > >> + if (access_hardware_ports && !ret) { > >> + ret = capng_update(CAPNG_ADD, cap_sets, > >> CAP_SYS_RAWIO); > >> + VLOG_INFO("The Linux capability CAP_SYS_RAWIO > >> enabled."); > > > > Shouldn't it be ... CAP_SYS_RAWIO is enabled. ? > > I can add the 'is' - but FYI, I copied all the log messages that were > suggested in previous emails.
The 'is' seems to be in the original suggestion. > > > >> + } > >> +#else > >> + if (access_hardware_ports) { > >> + VLOG_WARN("No driver requires Linux capability CAP_SYS_RAWIO, > >> disabling it."); > > > > Although it is not my personal preference, log messages respects > > the 79 characters limit. There is an example in this same file at > > line #405: "waiting until 10 seconds since last..." > > Okay, I can spin a v4 > > > Works for me. This is how I tested: > > # /usr/local/share/openvswitch/scripts/ovs-ctl \ > > --ovs-user='openvswitch:hugetlbfs' \ > > --ovs-vswitchd-options='--hw-rawio-access' start > > > > # getpcaps $(pidof ovs-vswitchd) > > 26923: > > cap_net_bind_service,cap_net_broadcast,cap_net_admin,cap_net_raw,cap_ipc_lock,cap_sys_rawio=ep > > 26922: > > cap_net_bind_service,cap_net_broadcast,cap_net_admin,cap_net_raw,cap_ipc_lock,cap_sys_rawio=ep > > > > > > # /usr/local/share/openvswitch/scripts/ovs-ctl stop > > Exiting ovs-vswitchd (26923). > > Exiting ovsdb-server (26910). > > > > > > # /usr/local/share/openvswitch/scripts/ovs-ctl \ > > --ovs-user='openvswitch:hugetlbfs' start > > # getpcaps $(pidof ovs-vswitchd) > > 27010: > > cap_net_bind_service,cap_net_broadcast,cap_net_admin,cap_net_raw,cap_ipc_lock=ep > > 27009: > > cap_net_bind_service,cap_net_broadcast,cap_net_admin,cap_net_raw,cap_ipc_lock=ep > > Thanks for showing the result. > > I didn't want to make a unit test for this because I wasn't sure > if there would ever be a reliable way to test for it (ie: we don't know > that the testing environment gets run as root, with those capabilities > available, etc). Also, I used 'capsh --decode' and didn't use getpcaps > before - neat! It requires root and at least a temporary user, and then I am not sure SELinux will get in the way, yeah... fbl > > > Thanks, > > fbl > > > > > >> + } > >> +#endif > >> } > >> } else { > >> ret = -1; > >> @@ -854,7 +866,7 @@ daemon_become_new_user_linux(bool access_datapath > >> OVS_UNUSED) > >> } > >> > >> static void > >> -daemon_become_new_user__(bool access_datapath) > >> +daemon_become_new_user__(bool access_datapath, bool access_hardware_ports) > >> { > >> /* If vlog file has been created, change its owner to the non-root > >> user > >> * as specifed by the --user option. */ > >> @@ -862,7 +874,8 @@ daemon_become_new_user__(bool access_datapath) > >> > >> if (LINUX) { > >> if (LIBCAPNG) { > >> - daemon_become_new_user_linux(access_datapath); > >> + daemon_become_new_user_linux(access_datapath, > >> + access_hardware_ports); > >> } else { > >> VLOG_FATAL("%s: fail to downgrade user using libcap-ng. " > >> "(libcap-ng is not configured at compile time), " > >> @@ -877,11 +890,11 @@ daemon_become_new_user__(bool access_datapath) > >> * However, there in case the user switch needs to be done > >> * before daemonize_start(), the following API can be used. */ > >> void > >> -daemon_become_new_user(bool access_datapath) > >> +daemon_become_new_user(bool access_datapath, bool access_hardware_ports) > >> { > >> assert_single_threaded(); > >> if (switch_user) { > >> - daemon_become_new_user__(access_datapath); > >> + daemon_become_new_user__(access_datapath, access_hardware_ports); > >> /* daemonize_start() should not switch user again. */ > >> switch_user = false; > >> } > >> diff --git a/lib/daemon-windows.c b/lib/daemon-windows.c > >> index 7e5f264f5b..4e6bbe0f04 100644 > >> --- a/lib/daemon-windows.c > >> +++ b/lib/daemon-windows.c > >> @@ -498,7 +498,8 @@ make_pidfile(void) > >> } > >> > >> void > >> -daemonize_start(bool access_datapath OVS_UNUSED) > >> +daemonize_start(bool access_datapath OVS_UNUSED, > >> + bool access_hardware_ports OVS_UNUSED) > >> { > >> if (pidfile) { > >> make_pidfile(); > >> @@ -526,7 +527,8 @@ daemonize_complete(void) > >> } > >> > >> void > >> -daemon_become_new_user(bool access_datapath OVS_UNUSED) > >> +daemon_become_new_user(bool access_datapath OVS_UNUSED, > >> + bool access_hardware_ports OVS_UNUSED) > >> { > >> } > >> > >> diff --git a/lib/daemon.c b/lib/daemon.c > >> index 3249c5ab4b..1e1c019eb1 100644 > >> --- a/lib/daemon.c > >> +++ b/lib/daemon.c > >> @@ -48,7 +48,7 @@ get_detach(void) > >> void > >> daemonize(void) > >> { > >> - daemonize_start(false); > >> + daemonize_start(false, false); > >> daemonize_complete(); > >> } > >> > >> diff --git a/lib/daemon.h b/lib/daemon.h > >> index 0941574963..42372d1463 100644 > >> --- a/lib/daemon.h > >> +++ b/lib/daemon.h > >> @@ -167,10 +167,10 @@ void set_detach(void); > >> bool get_detach(void); > >> void daemon_save_fd(int fd); > >> void daemonize(void); > >> -void daemonize_start(bool access_datapath); > >> +void daemonize_start(bool access_datapath, bool access_hardware_ports); > >> void daemonize_complete(void); > >> void daemon_set_new_user(const char * user_spec); > >> -void daemon_become_new_user(bool access_datapath); > >> +void daemon_become_new_user(bool access_datapath, bool > >> access_hardware_ports); > >> void daemon_usage(void); > >> void daemon_disable_self_confinement(void); > >> bool daemon_should_self_confine(void); > >> diff --git a/ovsdb/ovsdb-client.c b/ovsdb/ovsdb-client.c > >> index f1b8d64910..bae2c5f041 100644 > >> --- a/ovsdb/ovsdb-client.c > >> +++ b/ovsdb/ovsdb-client.c > >> @@ -250,7 +250,7 @@ main(int argc, char *argv[]) > >> parse_options(argc, argv); > >> fatal_ignore_sigpipe(); > >> > >> - daemon_become_new_user(false); > >> + daemon_become_new_user(false, false); > >> if (optind >= argc) { > >> ovs_fatal(0, "missing command name; use --help for help"); > >> } > >> @@ -1392,7 +1392,7 @@ do_monitor__(struct jsonrpc *rpc, const char > >> *database, > >> > >> daemon_save_fd(STDOUT_FILENO); > >> daemon_save_fd(STDERR_FILENO); > >> - daemonize_start(false); > >> + daemonize_start(false, false); > >> if (get_detach()) { > >> int error; > >> > >> @@ -2276,7 +2276,7 @@ do_lock(struct jsonrpc *rpc, const char *method, > >> const char *lock) > >> getting a reply of the previous > >> request. */ > >> daemon_save_fd(STDOUT_FILENO); > >> - daemonize_start(false); > >> + daemonize_start(false, false); > >> lock_req_init(&lock_req, method, lock); > >> > >> if (get_detach()) { > >> diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c > >> index 33ca4910d7..4fea2dbda7 100644 > >> --- a/ovsdb/ovsdb-server.c > >> +++ b/ovsdb/ovsdb-server.c > >> @@ -341,7 +341,7 @@ main(int argc, char *argv[]) > >> &run_command, &sync_from, &sync_exclude, &active); > >> is_backup = sync_from && !active; > >> > >> - daemon_become_new_user(false); > >> + daemon_become_new_user(false, false); > >> > >> /* Create and initialize 'config_tmpfile' as a temporary file to hold > >> * ovsdb-server's most basic configuration, and then save our initial > >> @@ -359,7 +359,7 @@ main(int argc, char *argv[]) > >> save_config__(config_tmpfile, &remotes, &db_filenames, sync_from, > >> sync_exclude, is_backup); > >> > >> - daemonize_start(false); > >> + daemonize_start(false, false); > >> > >> /* Load the saved config. */ > >> load_config(config_tmpfile, &remotes, &db_filenames, &sync_from, > >> diff --git a/tests/test-netflow.c b/tests/test-netflow.c > >> index d2322d4509..7f89cfcae0 100644 > >> --- a/tests/test-netflow.c > >> +++ b/tests/test-netflow.c > >> @@ -195,7 +195,7 @@ test_netflow_main(int argc, char *argv[]) > >> } > >> > >> daemon_save_fd(STDOUT_FILENO); > >> - daemonize_start(false); > >> + daemonize_start(false, false); > >> > >> error = unixctl_server_create(NULL, &server); > >> if (error) { > >> diff --git a/tests/test-sflow.c b/tests/test-sflow.c > >> index 460d4d6c54..3c617bdd16 100644 > >> --- a/tests/test-sflow.c > >> +++ b/tests/test-sflow.c > >> @@ -709,7 +709,7 @@ test_sflow_main(int argc, char *argv[]) > >> } > >> > >> daemon_save_fd(STDOUT_FILENO); > >> - daemonize_start(false); > >> + daemonize_start(false, false); > >> > >> error = unixctl_server_create(NULL, &server); > >> if (error) { > >> diff --git a/tests/test-unixctl.c b/tests/test-unixctl.c > >> index 3eadf54cd9..9e89827895 100644 > >> --- a/tests/test-unixctl.c > >> +++ b/tests/test-unixctl.c > >> @@ -83,7 +83,7 @@ test_unixctl_main(int argc, char *argv[]) > >> fatal_ignore_sigpipe(); > >> parse_options(&argc, &argv, &unixctl_path); > >> > >> - daemonize_start(false); > >> + daemonize_start(false, false); > >> int retval = unixctl_server_create(unixctl_path, &unixctl); > >> if (retval) { > >> exit(EXIT_FAILURE); > >> diff --git a/utilities/ovs-ofctl.c b/utilities/ovs-ofctl.c > >> index eabec18a36..f81f5f759a 100644 > >> --- a/utilities/ovs-ofctl.c > >> +++ b/utilities/ovs-ofctl.c > >> @@ -173,7 +173,7 @@ main(int argc, char *argv[]) > >> ctx.argc = argc - optind; > >> ctx.argv = argv + optind; > >> > >> - daemon_become_new_user(false); > >> + daemon_become_new_user(false, false); > >> if (read_only) { > >> ovs_cmdl_run_command_read_only(&ctx, get_all_commands()); > >> } else { > >> @@ -2127,7 +2127,7 @@ monitor_vconn(struct vconn *vconn, bool > >> reply_to_echo_requests, > >> int error; > >> > >> daemon_save_fd(STDERR_FILENO); > >> - daemonize_start(false); > >> + daemonize_start(false, false); > >> error = unixctl_server_create(unixctl_path, &server); > >> if (error) { > >> ovs_fatal(error, "failed to create unixctl server"); > >> diff --git a/utilities/ovs-testcontroller.c > >> b/utilities/ovs-testcontroller.c > >> index b489ff5fc7..9f2fbfdf51 100644 > >> --- a/utilities/ovs-testcontroller.c > >> +++ b/utilities/ovs-testcontroller.c > >> @@ -109,7 +109,7 @@ main(int argc, char *argv[]) > >> parse_options(argc, argv); > >> fatal_ignore_sigpipe(); > >> > >> - daemon_become_new_user(false); > >> + daemon_become_new_user(false, false); > >> > >> if (argc - optind < 1) { > >> ovs_fatal(0, "at least one vconn argument required; " > >> @@ -148,7 +148,7 @@ main(int argc, char *argv[]) > >> ovs_fatal(0, "no active or passive switch connections"); > >> } > >> > >> - daemonize_start(false); > >> + daemonize_start(false, false); > >> > >> retval = unixctl_server_create(unixctl_path, &unixctl); > >> if (retval) { > >> diff --git a/vswitchd/ovs-vswitchd.8.in b/vswitchd/ovs-vswitchd.8.in > >> index 9569265fcb..10c6e077ba 100644 > >> --- a/vswitchd/ovs-vswitchd.8.in > >> +++ b/vswitchd/ovs-vswitchd.8.in > >> @@ -81,6 +81,15 @@ unavailable or unsuccessful. > >> .SS "DPDK Options" > >> For details on initializing \fBovs\-vswitchd\fR to use DPDK ports, > >> refer to the documentation or \fBovs\-vswitchd.conf.db\fR(5). > >> +.SS "DPDK HW Access Options" > >> +.IP "\fB\-\-hw\-rawio\-access\fR" > >> +Tells \fBovs\-vswitchd\fR to retain the \fBCAP_SYS_RAWIO\fR capability, > >> +to allow userspace drivers access to raw hardware memory. This will > >> +also allow the \fBovs\-vswitchd\fR daemon to call \fBiopl()\fR and > >> +\fBioperm()\fR functions as well as access memory devices to set port > >> +access. This is a \fBvery\fR powerful capability, so generally only > >> +enable as needed for specific hardware (for example mlx5 with full > >> +hardware offload via rte_flow). > >> .SS "Daemon Options" > >> .ds DD \ > >> \fBovs\-vswitchd\fR detaches only after it has connected to the \ > >> diff --git a/vswitchd/ovs-vswitchd.c b/vswitchd/ovs-vswitchd.c > >> index 407bfc60eb..a244d2f709 100644 > >> --- a/vswitchd/ovs-vswitchd.c > >> +++ b/vswitchd/ovs-vswitchd.c > >> @@ -60,6 +60,9 @@ VLOG_DEFINE_THIS_MODULE(vswitchd); > >> * the kernel from paging any of its memory to disk. */ > >> static bool want_mlockall; > >> > >> +/* --hw-rawio-access: If set, retains CAP_SYS_RAWIO privileges. */ > >> +static bool hw_rawio_access; > >> + > >> static unixctl_cb_func ovs_vswitchd_exit; > >> > >> static char *parse_options(int argc, char *argv[], char **unixctl_path); > >> @@ -89,7 +92,7 @@ main(int argc, char *argv[]) > >> remote = parse_options(argc, argv, &unixctl_path); > >> fatal_ignore_sigpipe(); > >> > >> - daemonize_start(true); > >> + daemonize_start(true, hw_rawio_access); > >> > >> if (want_mlockall) { > >> #ifdef HAVE_MLOCKALL > >> @@ -169,6 +172,7 @@ parse_options(int argc, char *argv[], char > >> **unixctl_pathp) > >> OPT_DPDK, > >> SSL_OPTION_ENUMS, > >> OPT_DUMMY_NUMA, > >> + OPT_HW_RAWIO_ACCESS, > >> }; > >> static const struct option long_options[] = { > >> {"help", no_argument, NULL, 'h'}, > >> @@ -185,6 +189,7 @@ parse_options(int argc, char *argv[], char > >> **unixctl_pathp) > >> {"disable-system-route", no_argument, NULL, > >> OPT_DISABLE_SYSTEM_ROUTE}, > >> {"dpdk", optional_argument, NULL, OPT_DPDK}, > >> {"dummy-numa", required_argument, NULL, OPT_DUMMY_NUMA}, > >> + {"hw-rawio-access", no_argument, NULL, OPT_HW_RAWIO_ACCESS}, > >> {NULL, 0, NULL, 0}, > >> }; > >> char *short_options = > >> ovs_cmdl_long_options_to_short_options(long_options); > >> @@ -249,6 +254,10 @@ parse_options(int argc, char *argv[], char > >> **unixctl_pathp) > >> ovs_numa_set_dummy(optarg); > >> break; > >> > >> + case OPT_HW_RAWIO_ACCESS: > >> + hw_rawio_access = true; > >> + break; > >> + > >> default: > >> abort(); > >> } > >> -- > >> 2.39.2 > >> > >> _______________________________________________ > >> dev mailing list > >> d...@openvswitch.org > >> https://mail.openvswitch.org/mailman/listinfo/ovs-dev > > _______________________________________________ > dev mailing list > d...@openvswitch.org > https://mail.openvswitch.org/mailman/listinfo/ovs-dev -- fbl _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev