[PATCH 0/1] tap: fix write-after-free and double free of intr_handle

2022-05-02 Thread Quentin Armitage
Starting and stopping a tap device results in the following output:
  tap_lsc_intr_handle_set(): intr callback unregister failed: -2
  free(): invalid pointer
and a core dump is generated due to abort() being called.

Although the stack backtrace below gives line numbers for dpdk-21.11, this 
problem
still occurs with the current HEAD of the development tree (commit 7615ec581).

The stack backtrace is:

#0  0x7f80741d12a2 in raise () from /lib64/libc.so.6
#1  0x7f80741ba8a4 in abort () from /lib64/libc.so.6
#2  0x7f8074213ac7 in __libc_message () from /lib64/libc.so.6
#3  0x7f807421b73c in malloc_printerr () from /lib64/libc.so.6
#4  0x7f807421c97c in _int_free () from /lib64/libc.so.6
#5  0x7f80742207a8 in free () from /lib64/libc.so.6
#6  0x7f8074374bf5 in rte_intr_instance_free 
(intr_handle=intr_handle@entry=0x1003b2480) at 
../lib/eal/common/eal_common_interrupts.c:184
#7  0x7f8072bf16ce in tap_rx_intr_vec_uninstall 
(dev=dev@entry=0x7f80744ed480 ) at 
../drivers/net/tap/tap_intr.c:38
#8  0x7f8072bf18d3 in tap_rx_intr_vec_set (dev=dev@entry=0x7f80744ed480 
, set=set@entry=0) at ../drivers/net/tap/tap_intr.c:111
#9  0x7f8072bea742 in tap_intr_handle_set (dev=dev@entry=0x7f80744ed480 
, set=set@entry=0) at ../drivers/net/tap/rte_eth_tap.c:1727
#10 0x7f8072bea7d0 in tap_dev_stop (dev=0x7f80744ed480 ) 
at ../drivers/net/tap/rte_eth_tap.c:916
#11 0x7f80744875a4 in rte_eth_dev_stop (port_id=) at 
../lib/ethdev/rte_ethdev.c:1883
#12 0x0040158b in main (argc=4, argv=0x7ffd13cfc368) at tap_free.c:59

A sample program to demonstrate the problem is
===
// Run as: build/tap_free --vdev=net_tap0,remote=PORT -l 0,1

#include 

#include 
#include 
#include 

int
main(int argc, char *argv[])
{
uint16_t port_id;
struct rte_mempool *mbuf_pool;
struct rte_eth_conf port_conf;
struct rte_eth_dev_info dev_info;
uint16_t nb_rxd = 1024;
uint16_t nb_txd = 1024;
struct rte_eth_txconf txconf;

if (rte_eal_init(argc, argv) < 0)
rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");

if (rte_eth_dev_count_avail() < 1)
rte_exit(EXIT_FAILURE, "Error: should have at least 1 port\n");

port_id = rte_eth_find_next_owned_by(0, RTE_ETH_DEV_NO_OWNER);

mbuf_pool = rte_pktmbuf_pool_create("mbuf_pool", 1023, 256, 0, 
RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());

if (!rte_eth_dev_is_valid_port(port_id))
rte_exit(1, "a\n");

memset(&port_conf, 0, sizeof(struct rte_eth_conf));

if (rte_eth_dev_info_get(port_id, &dev_info))
rte_exit(1, "b\n");

if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
port_conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;

if (rte_eth_dev_configure(port_id, 1, 1, &port_conf))
rte_exit(1, "c\n");

if (rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd))
rte_exit(1, "d\n");

if (rte_eth_rx_queue_setup(port_id, 0, nb_rxd, 
rte_eth_dev_socket_id(port_id), NULL, mbuf_pool) < 0)
rte_exit(1, "e\n");

txconf = dev_info.default_txconf;
txconf.offloads = port_conf.txmode.offloads;
if (rte_eth_tx_queue_setup(port_id, 0, nb_txd, 
rte_eth_dev_socket_id(port_id), &txconf) < 0)
rte_exit(1, "f\n");

if (rte_eth_dev_start(port_id) < 0)
rte_exit(1, "g\n");

printf("Calling rte_eth_dev_stop - will error without patch\n"); 
fflush(stdout);
rte_eth_dev_stop(port_id);
printf("Returned from rte_eth_dev_stop\n"); fflush(stdout);
rte_eth_dev_close(port_id);

rte_eal_cleanup();
}
===

This problem is caused by tap_tx_intr_vec_uninstall() calling
rte_intr_instance_free() which frees pmd->intr_handle (and it doesn't
set pmd->intr_handle to NULL, although this is not the cause of the issue).

When tap_rx_intr_vec_uninstall() is called from tap_rx_intr_vec_set() with
the set parameter != 0, which occurs when rte_eth_dev_start() is called,
it frees pmd->intr_handle, and tap_rx_intr_vec_install() is subsequently
called. If intr_conf.rxq is not set, this does not cause an immediate
problem, but if it is set, it will write to (the now) unallocated memory.

The main problem occurs when tap_dev_stop() is called which in turn calls
tap_intr_handle_set() and tap_lsc_intr_handle_set(). This uses
pmd->intr_handle which has now been overwritten due to being previously
freed. When rte_intr_instance_free() is called via tap_rx_intr_vec_uninstall(),
due to intr_handle->alloc_flags having been overwritten by a subsequen

[PATCH 1/1] tap: fix write-after-free and double free of intr_handle

2022-05-02 Thread Quentin Armitage
rte_pmd_tun/tap_probe() allocates pmd->intr_handle in
eth_dev_tap_create() and it should not be freed until
rte_pmd_tap_remove() is called.

Inspection of tap_rx_intr_vec_set() shows that the call to
tap_tx_intr_vec_uninstall() was calling rte_intr_instance_free() but
tap_tx_intr_vec_install() can then be immediately called, and this then
uses pmd->intr_handle without it being reallocated.

This commit moves the call of rte_intr_instance_free() from
tap_tx_intr_vec_uninstall() to rte_pmd_tap_remove().

Signed-off-by: Quentin Armitage 
---
 drivers/net/tap/rte_eth_tap.c | 5 +
 drivers/net/tap/tap_intr.c| 2 --
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index bc3d56a311..aab1692ebf 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -2612,14 +2612,19 @@ static int
 rte_pmd_tap_remove(struct rte_vdev_device *dev)
 {
struct rte_eth_dev *eth_dev = NULL;
+   struct pmd_internals *pmd;
+   struct rte_intr_handle *intr_handle;
 
/* find the ethdev entry */
eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev));
if (!eth_dev)
return 0;
 
+   pmd = eth_dev->data->dev_private;
+   intr_handle = pmd->intr_handle;
tap_dev_close(eth_dev);
rte_eth_dev_release_port(eth_dev);
+   rte_intr_instance_free(intr_handle);
 
return 0;
 }
diff --git a/drivers/net/tap/tap_intr.c b/drivers/net/tap/tap_intr.c
index 56c343acea..a9097def1a 100644
--- a/drivers/net/tap/tap_intr.c
+++ b/drivers/net/tap/tap_intr.c
@@ -34,8 +34,6 @@ tap_rx_intr_vec_uninstall(struct rte_eth_dev *dev)
rte_intr_free_epoll_fd(intr_handle);
rte_intr_vec_list_free(intr_handle);
rte_intr_nb_efd_set(intr_handle, 0);
-
-   rte_intr_instance_free(intr_handle);
 }
 
 /**
-- 
2.34.1



[PATCH v2] tap: fix write-after-free and double free of intr_handle

2022-05-03 Thread Quentin Armitage
rte_pmd_tun/tap_probe() allocates pmd->intr_handle in eth_dev_tap_create()
and it should not be freed until rte_pmd_tap_remove() is called.

Inspection of tap_rx_intr_vec_set() shows that the call to
tap_tx_intr_vec_uninstall() was calling rte_intr_instance_free() but
tap_tx_intr_vec_install() can then be immediately called, and this then
uses pmd->intr_handle without it being reallocated.

This commit moves the call of rte_intr_instance_free() from
tap_tx_intr_vec_uninstall() to rte_pmd_tap_remove().

Fixes: d61138d4f0e2 ("drivers: remove direct access to interrupt handle")

Changes in v2:
  Move rte_intr_instance_free() from tap_rx_intr_vec_uninstall()
  to tap_dev_close().

Signed-off-by: Quentin Armitage 
---
 drivers/net/tap/rte_eth_tap.c | 2 ++
 drivers/net/tap/tap_intr.c| 2 --
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index bc3d56a311..5495818be6 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -1213,6 +1213,8 @@ tap_dev_close(struct rte_eth_dev *dev)
TAP_LOG(DEBUG, "Closing %s Ethernet device on numa %u",
tuntap_types[internals->type], rte_socket_id());
 
+   rte_intr_instance_free(internals->intr_handle);
+
if (internals->ioctl_sock != -1) {
close(internals->ioctl_sock);
internals->ioctl_sock = -1;
diff --git a/drivers/net/tap/tap_intr.c b/drivers/net/tap/tap_intr.c
index 56c343acea..a9097def1a 100644
--- a/drivers/net/tap/tap_intr.c
+++ b/drivers/net/tap/tap_intr.c
@@ -34,8 +34,6 @@ tap_rx_intr_vec_uninstall(struct rte_eth_dev *dev)
rte_intr_free_epoll_fd(intr_handle);
rte_intr_vec_list_free(intr_handle);
rte_intr_nb_efd_set(intr_handle, 0);
-
-   rte_intr_instance_free(intr_handle);
 }
 
 /**
-- 
2.34.1



[PATCH] libpcapng: fix timestamp wrapping in output files

2022-05-07 Thread Quentin Armitage
In pcap_tsc_to_ns(), delta * NSEC_PER_SEC will overflow approx 8
seconds after pcap_init is called when using a TSC with a frequency
of 2.5GHz.

To avoid the overflow, reread the time and TSC once
delta * NSEC_PER_SEC > (1 << 63). In order to ensure that there
is no overflow if there is a several second gap between calls to
pcapng_tsc_to_ns() the actual check to reread the clock is:
  delta > ((1ULL << 63) / NSEC_PER_SEC)

Fixes: 8d23ce8f5ee ("pcapng: add new library for writing pcapng files")
Cc: sta...@dpdk.org

Signed-off-by: Quentin Armitage 
---
 lib/pcapng/rte_pcapng.c | 15 ++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/lib/pcapng/rte_pcapng.c b/lib/pcapng/rte_pcapng.c
index 90b2f5bc69..7770be725f 100644
--- a/lib/pcapng/rte_pcapng.c
+++ b/lib/pcapng/rte_pcapng.c
@@ -34,7 +34,7 @@ struct rte_pcapng {
 };
 
 /* For converting TSC cycles to PCAPNG ns format */
-struct pcapng_time {
+static struct pcapng_time {
uint64_t ns;
uint64_t cycles;
 } pcapng_time;
@@ -53,7 +53,21 @@ static uint64_t pcapng_tsc_to_ns(uint64_t cycles)
 {
uint64_t delta;
 
+   /* With a TSC frequency of 2.5GHz, delta * NSEC_PER_SEC will
+* wrap in under 8 seconds. Once half that time has elapsed
+* reread the system clock and TSC to ensure wrapping does not
+* occur.
+*/
delta = cycles - pcapng_time.cycles;
+   if (delta > ((1ULL << 63) / NSEC_PER_SEC)) {
+   pcapng_init();
+   if (cycles > pcapng_time.cycles)
+   delta = cycles - pcapng_time.cycles;
+   else {
+   delta = pcapng_time.cycles - cycles;
+   return pcapng_time.ns - (delta * NSEC_PER_SEC) / 
rte_get_tsc_hz();
+   }
+   }
return pcapng_time.ns + (delta * NSEC_PER_SEC) / rte_get_tsc_hz();
 }
 
-- 
2.34.1



Re: [PATCH] libpcapng: fix timestamp wrapping in output files

2022-05-14 Thread Quentin Armitage
On Wed, 2022-05-11 at 09:46 -0700, Stephen Hemminger wrote:
> On Sat,  7 May 2022 17:12:36 +0100
> Quentin Armitage  wrote:
> 
> > In pcap_tsc_to_ns(), delta * NSEC_PER_SEC will overflow approx 8
> > seconds after pcap_init is called when using a TSC with a frequency
> > of 2.5GHz.
> > 
> > To avoid the overflow, reread the time and TSC once
> > delta * NSEC_PER_SEC > (1 << 63). In order to ensure that there
> > is no overflow if there is a several second gap between calls to
> > pcapng_tsc_to_ns() the actual check to reread the clock is:
> >   delta > ((1ULL << 63) / NSEC_PER_SEC)
> > 
> > Fixes: 8d23ce8f5ee ("pcapng: add new library for writing pcapng files")
> > Cc: sta...@dpdk.org
> > 
> > Signed-off-by: Quentin Armitage 
> 
> What about something like this instead.
> 
> diff --git a/lib/pcapng/rte_pcapng.c b/lib/pcapng/rte_pcapng.c
> index 90b2f5bc6905..c5534301bf2c 100644
> --- a/lib/pcapng/rte_pcapng.c
> +++ b/lib/pcapng/rte_pcapng.c
> @@ -19,6 +19,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  
>  #include "pcapng_proto.h"
> @@ -34,27 +35,39 @@ struct rte_pcapng {
>  };
>  
>  /* For converting TSC cycles to PCAPNG ns format */
> -struct pcapng_time {
> +#define TICK_SCALE 16u
> +static struct {
> uint64_t ns;
> uint64_t cycles;
> +   struct rte_reciprocal_u64 inverse;
>  } pcapng_time;
>  
>  RTE_INIT(pcapng_init)
>  {
> struct timespec ts;
> +   uint64_t scale_tick_per_ns;
>  
> pcapng_time.cycles = rte_get_tsc_cycles();
> clock_gettime(CLOCK_REALTIME, &ts);
> pcapng_time.ns = rte_timespec_to_ns(&ts);
> +
> +   scale_tick_per_ns = (rte_get_tsc_hz() * TICK_SCALE) / NSEC_PER_SEC;
> +   pcapng_time.inverse = rte_reciprocal_value_u64(scale_tick_per_ns);
>  }
>  
>  /* PCAPNG timestamps are in nanoseconds */
>  static uint64_t pcapng_tsc_to_ns(uint64_t cycles)
>  {
> -   uint64_t delta;
> +   uint64_t delta, elapsed;
>  
> delta = cycles - pcapng_time.cycles;
> -   return pcapng_time.ns + (delta * NSEC_PER_SEC) / rte_get_tsc_hz();
> +
> +   /* Compute elapsed time in nanoseconds scaled by TICK_SCALE
> +    * since the start of the capture.
> +    * With scale of 4 this will roll over in 36 years.
> +    */
> +   elapsed = rte_reciprocal_divide_u64(delta, &pcapng_time.inverse);
> +   return pcapng_time.ns + elapsed / TICK_SCALE;
>  }
>  
>  /* length of option including padding */
> 

The final statement of pcapng_tsc_to_ns() should be:
return pcapng_time.ns + elapsed * TICK_SCALE;

There is also a problem that rte_get_tsc_hz() returns eal_tsc_resolution_hz, but
this is not initialized until rte_eal_init() is called, so rte_get_tsc_hz()
cannot be called from a constructor function.

While both of the above problems can easily be solved, I think there is a
problem with accuracy with this approach. With a 3GHz clock, scale_tick_per_ns
would be 48. For other clock speeds there can be a truncation in the
calculation. With a 3.3GHz clock, scale_tick_per_ns will be truncated from 52.8
to 52, resulting in a 1.5% or so error in the time returned by
pcapng_tsc_to_ns() (a 2.3GHz clock results in a 2.2% error). Increasing
TICK_SCALE reduces the %age error, but also reduces the time before overflow
occurs.

If the approach in the following patch is considered to be acceptable, I would
be very happy to submit an updated patch. The one concern I have about the patch
is introducing a new constructor priority, RTE_PRIORITY_TIMER, which may be
considered to be inappropriate. If it is inappropriate, then the simplest
alternative would be to introduce a new function rte_tsc_get_hz_init() which
calls set_tsc_freq() if eal_tsc_resolution_hz has not been initialized
(alternatively rte_get_tsc_hz() could be modified to make the check, but that
then produces an overhead every time the function is called).

diff --git a/lib/eal/common/eal_common_timer.c
b/lib/eal/common/eal_common_timer.c
index 5686a5102b..cb3fa1e240 100644
--- a/lib/eal/common/eal_common_timer.c
+++ b/lib/eal/common/eal_common_timer.c
@@ -54,6 +54,9 @@ set_tsc_freq(void)
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
uint64_t freq;
 
+   if (eal_tsc_resolution_hz)
+   return;
+
if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
/*
 * Just use the primary process calculated TSC rate in any
@@ -86,3 +89,8 @@ RTE_INIT(rte_timer_init)
/* set rte_delay_us_block as a delay function */
rte_delay_us_callback_register(rte_delay_us_block);
 }
+
+RTE_INIT_PRIO(rte_tsc_init, TIMER)
+{
+   se

[PATCH v2] libpcapng: fix timestamp wrapping in output files

2022-05-17 Thread Quentin Armitage
In pcap_tsc_to_ns(), delta * NSEC_PER_SEC will overflow approx 8
seconds after pcap_init is called when using a TSC with a frequency
of 2.5GHz.

To avoid the overflow, update the saved time and TSC value once
delta >= tsc_hz.

Fixes: 8d23ce8f5ee ("pcapng: add new library for writing pcapng files")
Cc: sta...@dpdk.org

Signed-off-by: Quentin Armitage 
---
v2:
  - Don't call clock_gettime() in fast path
  - Update pcapng_time.ns and pcapng_time.cycles to ensure delta < tsc_hz
  - Stop using constructor to initialise pcapng_time.tsc_hz since
 it is not initialised until rte_eal_init() is called
  - use mean value of TSC before and after call to clock_gettime()
  - only call rte_get_tsc_hz() once
  - use rte_reciprocal functions instead of division

 lib/pcapng/rte_pcapng.c | 47 -
 1 file changed, 42 insertions(+), 5 deletions(-)

diff --git a/lib/pcapng/rte_pcapng.c b/lib/pcapng/rte_pcapng.c
index 90b2f5bc69..06ad712bd1 100644
--- a/lib/pcapng/rte_pcapng.c
+++ b/lib/pcapng/rte_pcapng.c
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include "pcapng_proto.h"
@@ -34,27 +35,63 @@ struct rte_pcapng {
 };
 
 /* For converting TSC cycles to PCAPNG ns format */
-struct pcapng_time {
+static struct pcapng_time {
uint64_t ns;
uint64_t cycles;
+   uint64_t tsc_hz;
+   struct rte_reciprocal_u64 tsc_hz_inverse;
 } pcapng_time;
 
-RTE_INIT(pcapng_init)
+static inline void
+pcapng_init(void)
 {
struct timespec ts;
 
pcapng_time.cycles = rte_get_tsc_cycles();
clock_gettime(CLOCK_REALTIME, &ts);
+   pcapng_time.cycles = (pcapng_time.cycles + rte_get_tsc_cycles()) / 2;
pcapng_time.ns = rte_timespec_to_ns(&ts);
+
+   pcapng_time.tsc_hz = rte_get_tsc_hz();
+   pcapng_time.tsc_hz_inverse = 
rte_reciprocal_value_u64(pcapng_time.tsc_hz);
 }
 
 /* PCAPNG timestamps are in nanoseconds */
 static uint64_t pcapng_tsc_to_ns(uint64_t cycles)
 {
-   uint64_t delta;
-
+   uint64_t delta, secs;
+
+   if (!pcapng_time.tsc_hz)
+   pcapng_init();
+
+   /* In essence the calculation is:
+*   delta = (cycles - pcapng_time.cycles) * NSEC_PRE_SEC / 
rte_get_tsc_hz()
+* but this overflows within 4 to 8 seconds depending on TSC frequency.
+* Instead, if delta >= pcapng_time.tsc_hz:
+*   Increase pcapng_time.ns and pcapng_time.cycles by the number of
+*   whole seconds in delta and reduce delta accordingly.
+* delta will therefore always lie in the interval [0, 
pcapng_time.tsc_hz),
+* which will not overflow when multiplied by NSEC_PER_SEC provided the
+* TSC frequency < approx 18.4GHz.
+*
+* Currently all TSCs operate below 5GHz.
+*/
delta = cycles - pcapng_time.cycles;
-   return pcapng_time.ns + (delta * NSEC_PER_SEC) / rte_get_tsc_hz();
+   if (unlikely(delta >= pcapng_time.tsc_hz)) {
+   if (likely(delta < pcapng_time.tsc_hz * 2)) {
+   delta -= pcapng_time.tsc_hz;
+   pcapng_time.cycles += pcapng_time.tsc_hz;
+   pcapng_time.ns += NSEC_PER_SEC;
+   } else {
+   secs = rte_reciprocal_divide_u64(delta, 
&pcapng_time.tsc_hz_inverse);
+   delta -= secs * pcapng_time.tsc_hz;
+   pcapng_time.cycles += secs * pcapng_time.tsc_hz;
+   pcapng_time.ns += secs * NSEC_PER_SEC;
+   }
+   }
+
+   return pcapng_time.ns + rte_reciprocal_divide_u64(delta * NSEC_PER_SEC,
+ 
&pcapng_time.tsc_hz_inverse);
 }
 
 /* length of option including padding */
-- 
2.34.3