[PATCH 0/1] tap: fix write-after-free and double free of intr_handle
Starting and stopping a tap device results in the following output: tap_lsc_intr_handle_set(): intr callback unregister failed: -2 free(): invalid pointer and a core dump is generated due to abort() being called. Although the stack backtrace below gives line numbers for dpdk-21.11, this problem still occurs with the current HEAD of the development tree (commit 7615ec581). The stack backtrace is: #0 0x7f80741d12a2 in raise () from /lib64/libc.so.6 #1 0x7f80741ba8a4 in abort () from /lib64/libc.so.6 #2 0x7f8074213ac7 in __libc_message () from /lib64/libc.so.6 #3 0x7f807421b73c in malloc_printerr () from /lib64/libc.so.6 #4 0x7f807421c97c in _int_free () from /lib64/libc.so.6 #5 0x7f80742207a8 in free () from /lib64/libc.so.6 #6 0x7f8074374bf5 in rte_intr_instance_free (intr_handle=intr_handle@entry=0x1003b2480) at ../lib/eal/common/eal_common_interrupts.c:184 #7 0x7f8072bf16ce in tap_rx_intr_vec_uninstall (dev=dev@entry=0x7f80744ed480 ) at ../drivers/net/tap/tap_intr.c:38 #8 0x7f8072bf18d3 in tap_rx_intr_vec_set (dev=dev@entry=0x7f80744ed480 , set=set@entry=0) at ../drivers/net/tap/tap_intr.c:111 #9 0x7f8072bea742 in tap_intr_handle_set (dev=dev@entry=0x7f80744ed480 , set=set@entry=0) at ../drivers/net/tap/rte_eth_tap.c:1727 #10 0x7f8072bea7d0 in tap_dev_stop (dev=0x7f80744ed480 ) at ../drivers/net/tap/rte_eth_tap.c:916 #11 0x7f80744875a4 in rte_eth_dev_stop (port_id=) at ../lib/ethdev/rte_ethdev.c:1883 #12 0x0040158b in main (argc=4, argv=0x7ffd13cfc368) at tap_free.c:59 A sample program to demonstrate the problem is === // Run as: build/tap_free --vdev=net_tap0,remote=PORT -l 0,1 #include #include #include #include int main(int argc, char *argv[]) { uint16_t port_id; struct rte_mempool *mbuf_pool; struct rte_eth_conf port_conf; struct rte_eth_dev_info dev_info; uint16_t nb_rxd = 1024; uint16_t nb_txd = 1024; struct rte_eth_txconf txconf; if (rte_eal_init(argc, argv) < 0) rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); if (rte_eth_dev_count_avail() < 1) rte_exit(EXIT_FAILURE, "Error: should have at least 1 port\n"); port_id = rte_eth_find_next_owned_by(0, RTE_ETH_DEV_NO_OWNER); mbuf_pool = rte_pktmbuf_pool_create("mbuf_pool", 1023, 256, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id()); if (!rte_eth_dev_is_valid_port(port_id)) rte_exit(1, "a\n"); memset(&port_conf, 0, sizeof(struct rte_eth_conf)); if (rte_eth_dev_info_get(port_id, &dev_info)) rte_exit(1, "b\n"); if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) port_conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE; if (rte_eth_dev_configure(port_id, 1, 1, &port_conf)) rte_exit(1, "c\n"); if (rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd)) rte_exit(1, "d\n"); if (rte_eth_rx_queue_setup(port_id, 0, nb_rxd, rte_eth_dev_socket_id(port_id), NULL, mbuf_pool) < 0) rte_exit(1, "e\n"); txconf = dev_info.default_txconf; txconf.offloads = port_conf.txmode.offloads; if (rte_eth_tx_queue_setup(port_id, 0, nb_txd, rte_eth_dev_socket_id(port_id), &txconf) < 0) rte_exit(1, "f\n"); if (rte_eth_dev_start(port_id) < 0) rte_exit(1, "g\n"); printf("Calling rte_eth_dev_stop - will error without patch\n"); fflush(stdout); rte_eth_dev_stop(port_id); printf("Returned from rte_eth_dev_stop\n"); fflush(stdout); rte_eth_dev_close(port_id); rte_eal_cleanup(); } === This problem is caused by tap_tx_intr_vec_uninstall() calling rte_intr_instance_free() which frees pmd->intr_handle (and it doesn't set pmd->intr_handle to NULL, although this is not the cause of the issue). When tap_rx_intr_vec_uninstall() is called from tap_rx_intr_vec_set() with the set parameter != 0, which occurs when rte_eth_dev_start() is called, it frees pmd->intr_handle, and tap_rx_intr_vec_install() is subsequently called. If intr_conf.rxq is not set, this does not cause an immediate problem, but if it is set, it will write to (the now) unallocated memory. The main problem occurs when tap_dev_stop() is called which in turn calls tap_intr_handle_set() and tap_lsc_intr_handle_set(). This uses pmd->intr_handle which has now been overwritten due to being previously freed. When rte_intr_instance_free() is called via tap_rx_intr_vec_uninstall(), due to intr_handle->alloc_flags having been overwritten by a subsequen
[PATCH 1/1] tap: fix write-after-free and double free of intr_handle
rte_pmd_tun/tap_probe() allocates pmd->intr_handle in eth_dev_tap_create() and it should not be freed until rte_pmd_tap_remove() is called. Inspection of tap_rx_intr_vec_set() shows that the call to tap_tx_intr_vec_uninstall() was calling rte_intr_instance_free() but tap_tx_intr_vec_install() can then be immediately called, and this then uses pmd->intr_handle without it being reallocated. This commit moves the call of rte_intr_instance_free() from tap_tx_intr_vec_uninstall() to rte_pmd_tap_remove(). Signed-off-by: Quentin Armitage --- drivers/net/tap/rte_eth_tap.c | 5 + drivers/net/tap/tap_intr.c| 2 -- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c index bc3d56a311..aab1692ebf 100644 --- a/drivers/net/tap/rte_eth_tap.c +++ b/drivers/net/tap/rte_eth_tap.c @@ -2612,14 +2612,19 @@ static int rte_pmd_tap_remove(struct rte_vdev_device *dev) { struct rte_eth_dev *eth_dev = NULL; + struct pmd_internals *pmd; + struct rte_intr_handle *intr_handle; /* find the ethdev entry */ eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev)); if (!eth_dev) return 0; + pmd = eth_dev->data->dev_private; + intr_handle = pmd->intr_handle; tap_dev_close(eth_dev); rte_eth_dev_release_port(eth_dev); + rte_intr_instance_free(intr_handle); return 0; } diff --git a/drivers/net/tap/tap_intr.c b/drivers/net/tap/tap_intr.c index 56c343acea..a9097def1a 100644 --- a/drivers/net/tap/tap_intr.c +++ b/drivers/net/tap/tap_intr.c @@ -34,8 +34,6 @@ tap_rx_intr_vec_uninstall(struct rte_eth_dev *dev) rte_intr_free_epoll_fd(intr_handle); rte_intr_vec_list_free(intr_handle); rte_intr_nb_efd_set(intr_handle, 0); - - rte_intr_instance_free(intr_handle); } /** -- 2.34.1
[PATCH v2] tap: fix write-after-free and double free of intr_handle
rte_pmd_tun/tap_probe() allocates pmd->intr_handle in eth_dev_tap_create() and it should not be freed until rte_pmd_tap_remove() is called. Inspection of tap_rx_intr_vec_set() shows that the call to tap_tx_intr_vec_uninstall() was calling rte_intr_instance_free() but tap_tx_intr_vec_install() can then be immediately called, and this then uses pmd->intr_handle without it being reallocated. This commit moves the call of rte_intr_instance_free() from tap_tx_intr_vec_uninstall() to rte_pmd_tap_remove(). Fixes: d61138d4f0e2 ("drivers: remove direct access to interrupt handle") Changes in v2: Move rte_intr_instance_free() from tap_rx_intr_vec_uninstall() to tap_dev_close(). Signed-off-by: Quentin Armitage --- drivers/net/tap/rte_eth_tap.c | 2 ++ drivers/net/tap/tap_intr.c| 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c index bc3d56a311..5495818be6 100644 --- a/drivers/net/tap/rte_eth_tap.c +++ b/drivers/net/tap/rte_eth_tap.c @@ -1213,6 +1213,8 @@ tap_dev_close(struct rte_eth_dev *dev) TAP_LOG(DEBUG, "Closing %s Ethernet device on numa %u", tuntap_types[internals->type], rte_socket_id()); + rte_intr_instance_free(internals->intr_handle); + if (internals->ioctl_sock != -1) { close(internals->ioctl_sock); internals->ioctl_sock = -1; diff --git a/drivers/net/tap/tap_intr.c b/drivers/net/tap/tap_intr.c index 56c343acea..a9097def1a 100644 --- a/drivers/net/tap/tap_intr.c +++ b/drivers/net/tap/tap_intr.c @@ -34,8 +34,6 @@ tap_rx_intr_vec_uninstall(struct rte_eth_dev *dev) rte_intr_free_epoll_fd(intr_handle); rte_intr_vec_list_free(intr_handle); rte_intr_nb_efd_set(intr_handle, 0); - - rte_intr_instance_free(intr_handle); } /** -- 2.34.1
[PATCH] libpcapng: fix timestamp wrapping in output files
In pcap_tsc_to_ns(), delta * NSEC_PER_SEC will overflow approx 8 seconds after pcap_init is called when using a TSC with a frequency of 2.5GHz. To avoid the overflow, reread the time and TSC once delta * NSEC_PER_SEC > (1 << 63). In order to ensure that there is no overflow if there is a several second gap between calls to pcapng_tsc_to_ns() the actual check to reread the clock is: delta > ((1ULL << 63) / NSEC_PER_SEC) Fixes: 8d23ce8f5ee ("pcapng: add new library for writing pcapng files") Cc: sta...@dpdk.org Signed-off-by: Quentin Armitage --- lib/pcapng/rte_pcapng.c | 15 ++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/lib/pcapng/rte_pcapng.c b/lib/pcapng/rte_pcapng.c index 90b2f5bc69..7770be725f 100644 --- a/lib/pcapng/rte_pcapng.c +++ b/lib/pcapng/rte_pcapng.c @@ -34,7 +34,7 @@ struct rte_pcapng { }; /* For converting TSC cycles to PCAPNG ns format */ -struct pcapng_time { +static struct pcapng_time { uint64_t ns; uint64_t cycles; } pcapng_time; @@ -53,7 +53,21 @@ static uint64_t pcapng_tsc_to_ns(uint64_t cycles) { uint64_t delta; + /* With a TSC frequency of 2.5GHz, delta * NSEC_PER_SEC will +* wrap in under 8 seconds. Once half that time has elapsed +* reread the system clock and TSC to ensure wrapping does not +* occur. +*/ delta = cycles - pcapng_time.cycles; + if (delta > ((1ULL << 63) / NSEC_PER_SEC)) { + pcapng_init(); + if (cycles > pcapng_time.cycles) + delta = cycles - pcapng_time.cycles; + else { + delta = pcapng_time.cycles - cycles; + return pcapng_time.ns - (delta * NSEC_PER_SEC) / rte_get_tsc_hz(); + } + } return pcapng_time.ns + (delta * NSEC_PER_SEC) / rte_get_tsc_hz(); } -- 2.34.1
Re: [PATCH] libpcapng: fix timestamp wrapping in output files
On Wed, 2022-05-11 at 09:46 -0700, Stephen Hemminger wrote: > On Sat, 7 May 2022 17:12:36 +0100 > Quentin Armitage wrote: > > > In pcap_tsc_to_ns(), delta * NSEC_PER_SEC will overflow approx 8 > > seconds after pcap_init is called when using a TSC with a frequency > > of 2.5GHz. > > > > To avoid the overflow, reread the time and TSC once > > delta * NSEC_PER_SEC > (1 << 63). In order to ensure that there > > is no overflow if there is a several second gap between calls to > > pcapng_tsc_to_ns() the actual check to reread the clock is: > > delta > ((1ULL << 63) / NSEC_PER_SEC) > > > > Fixes: 8d23ce8f5ee ("pcapng: add new library for writing pcapng files") > > Cc: sta...@dpdk.org > > > > Signed-off-by: Quentin Armitage > > What about something like this instead. > > diff --git a/lib/pcapng/rte_pcapng.c b/lib/pcapng/rte_pcapng.c > index 90b2f5bc6905..c5534301bf2c 100644 > --- a/lib/pcapng/rte_pcapng.c > +++ b/lib/pcapng/rte_pcapng.c > @@ -19,6 +19,7 @@ > #include > #include > #include > +#include > #include > > #include "pcapng_proto.h" > @@ -34,27 +35,39 @@ struct rte_pcapng { > }; > > /* For converting TSC cycles to PCAPNG ns format */ > -struct pcapng_time { > +#define TICK_SCALE 16u > +static struct { > uint64_t ns; > uint64_t cycles; > + struct rte_reciprocal_u64 inverse; > } pcapng_time; > > RTE_INIT(pcapng_init) > { > struct timespec ts; > + uint64_t scale_tick_per_ns; > > pcapng_time.cycles = rte_get_tsc_cycles(); > clock_gettime(CLOCK_REALTIME, &ts); > pcapng_time.ns = rte_timespec_to_ns(&ts); > + > + scale_tick_per_ns = (rte_get_tsc_hz() * TICK_SCALE) / NSEC_PER_SEC; > + pcapng_time.inverse = rte_reciprocal_value_u64(scale_tick_per_ns); > } > > /* PCAPNG timestamps are in nanoseconds */ > static uint64_t pcapng_tsc_to_ns(uint64_t cycles) > { > - uint64_t delta; > + uint64_t delta, elapsed; > > delta = cycles - pcapng_time.cycles; > - return pcapng_time.ns + (delta * NSEC_PER_SEC) / rte_get_tsc_hz(); > + > + /* Compute elapsed time in nanoseconds scaled by TICK_SCALE > + * since the start of the capture. > + * With scale of 4 this will roll over in 36 years. > + */ > + elapsed = rte_reciprocal_divide_u64(delta, &pcapng_time.inverse); > + return pcapng_time.ns + elapsed / TICK_SCALE; > } > > /* length of option including padding */ > The final statement of pcapng_tsc_to_ns() should be: return pcapng_time.ns + elapsed * TICK_SCALE; There is also a problem that rte_get_tsc_hz() returns eal_tsc_resolution_hz, but this is not initialized until rte_eal_init() is called, so rte_get_tsc_hz() cannot be called from a constructor function. While both of the above problems can easily be solved, I think there is a problem with accuracy with this approach. With a 3GHz clock, scale_tick_per_ns would be 48. For other clock speeds there can be a truncation in the calculation. With a 3.3GHz clock, scale_tick_per_ns will be truncated from 52.8 to 52, resulting in a 1.5% or so error in the time returned by pcapng_tsc_to_ns() (a 2.3GHz clock results in a 2.2% error). Increasing TICK_SCALE reduces the %age error, but also reduces the time before overflow occurs. If the approach in the following patch is considered to be acceptable, I would be very happy to submit an updated patch. The one concern I have about the patch is introducing a new constructor priority, RTE_PRIORITY_TIMER, which may be considered to be inappropriate. If it is inappropriate, then the simplest alternative would be to introduce a new function rte_tsc_get_hz_init() which calls set_tsc_freq() if eal_tsc_resolution_hz has not been initialized (alternatively rte_get_tsc_hz() could be modified to make the check, but that then produces an overhead every time the function is called). diff --git a/lib/eal/common/eal_common_timer.c b/lib/eal/common/eal_common_timer.c index 5686a5102b..cb3fa1e240 100644 --- a/lib/eal/common/eal_common_timer.c +++ b/lib/eal/common/eal_common_timer.c @@ -54,6 +54,9 @@ set_tsc_freq(void) struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; uint64_t freq; + if (eal_tsc_resolution_hz) + return; + if (rte_eal_process_type() == RTE_PROC_SECONDARY) { /* * Just use the primary process calculated TSC rate in any @@ -86,3 +89,8 @@ RTE_INIT(rte_timer_init) /* set rte_delay_us_block as a delay function */ rte_delay_us_callback_register(rte_delay_us_block); } + +RTE_INIT_PRIO(rte_tsc_init, TIMER) +{ + se
[PATCH v2] libpcapng: fix timestamp wrapping in output files
In pcap_tsc_to_ns(), delta * NSEC_PER_SEC will overflow approx 8 seconds after pcap_init is called when using a TSC with a frequency of 2.5GHz. To avoid the overflow, update the saved time and TSC value once delta >= tsc_hz. Fixes: 8d23ce8f5ee ("pcapng: add new library for writing pcapng files") Cc: sta...@dpdk.org Signed-off-by: Quentin Armitage --- v2: - Don't call clock_gettime() in fast path - Update pcapng_time.ns and pcapng_time.cycles to ensure delta < tsc_hz - Stop using constructor to initialise pcapng_time.tsc_hz since it is not initialised until rte_eal_init() is called - use mean value of TSC before and after call to clock_gettime() - only call rte_get_tsc_hz() once - use rte_reciprocal functions instead of division lib/pcapng/rte_pcapng.c | 47 - 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/lib/pcapng/rte_pcapng.c b/lib/pcapng/rte_pcapng.c index 90b2f5bc69..06ad712bd1 100644 --- a/lib/pcapng/rte_pcapng.c +++ b/lib/pcapng/rte_pcapng.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include "pcapng_proto.h" @@ -34,27 +35,63 @@ struct rte_pcapng { }; /* For converting TSC cycles to PCAPNG ns format */ -struct pcapng_time { +static struct pcapng_time { uint64_t ns; uint64_t cycles; + uint64_t tsc_hz; + struct rte_reciprocal_u64 tsc_hz_inverse; } pcapng_time; -RTE_INIT(pcapng_init) +static inline void +pcapng_init(void) { struct timespec ts; pcapng_time.cycles = rte_get_tsc_cycles(); clock_gettime(CLOCK_REALTIME, &ts); + pcapng_time.cycles = (pcapng_time.cycles + rte_get_tsc_cycles()) / 2; pcapng_time.ns = rte_timespec_to_ns(&ts); + + pcapng_time.tsc_hz = rte_get_tsc_hz(); + pcapng_time.tsc_hz_inverse = rte_reciprocal_value_u64(pcapng_time.tsc_hz); } /* PCAPNG timestamps are in nanoseconds */ static uint64_t pcapng_tsc_to_ns(uint64_t cycles) { - uint64_t delta; - + uint64_t delta, secs; + + if (!pcapng_time.tsc_hz) + pcapng_init(); + + /* In essence the calculation is: +* delta = (cycles - pcapng_time.cycles) * NSEC_PRE_SEC / rte_get_tsc_hz() +* but this overflows within 4 to 8 seconds depending on TSC frequency. +* Instead, if delta >= pcapng_time.tsc_hz: +* Increase pcapng_time.ns and pcapng_time.cycles by the number of +* whole seconds in delta and reduce delta accordingly. +* delta will therefore always lie in the interval [0, pcapng_time.tsc_hz), +* which will not overflow when multiplied by NSEC_PER_SEC provided the +* TSC frequency < approx 18.4GHz. +* +* Currently all TSCs operate below 5GHz. +*/ delta = cycles - pcapng_time.cycles; - return pcapng_time.ns + (delta * NSEC_PER_SEC) / rte_get_tsc_hz(); + if (unlikely(delta >= pcapng_time.tsc_hz)) { + if (likely(delta < pcapng_time.tsc_hz * 2)) { + delta -= pcapng_time.tsc_hz; + pcapng_time.cycles += pcapng_time.tsc_hz; + pcapng_time.ns += NSEC_PER_SEC; + } else { + secs = rte_reciprocal_divide_u64(delta, &pcapng_time.tsc_hz_inverse); + delta -= secs * pcapng_time.tsc_hz; + pcapng_time.cycles += secs * pcapng_time.tsc_hz; + pcapng_time.ns += secs * NSEC_PER_SEC; + } + } + + return pcapng_time.ns + rte_reciprocal_divide_u64(delta * NSEC_PER_SEC, + &pcapng_time.tsc_hz_inverse); } /* length of option including padding */ -- 2.34.3