Added "csum txprep (on|off)" command which allows to switch to the
tx path using Tx preparation API.
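
For example, an illustrative testpmd session enabling the new path
(the commands other than "csum txprep" are existing testpmd commands):

    testpmd> set fwd csum
    testpmd> csum txprep on
    testpmd> start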

By default, the unchanged implementation is used.

When the Tx preparation path is enabled, the pseudo-header checksum
calculation for UDP/TCP/TSO packets is no longer done by the
application; instead, the Tx preparation API is used for packet
preparation and verification.
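
A minimal sketch of this prepare-then-burst sequence is shown below;
the tx_prepare_and_burst() helper name is illustrative and not part of
the patch, and error handling is reduced to a log message:

    #include <stdio.h>

    #include <rte_errno.h>
    #include <rte_ethdev.h>
    #include <rte_mbuf.h>

    /* Validate and fix up a Tx burst with rte_eth_tx_prepare(), then
     * transmit only the packets that passed preparation. */
    static uint16_t
    tx_prepare_and_burst(uint8_t port, uint16_t queue,
                         struct rte_mbuf **pkts, uint16_t nb)
    {
            uint16_t nb_prep = rte_eth_tx_prepare(port, queue, pkts, nb);

            if (nb_prep != nb)
                    printf("tx_prepare failed: %s\n",
                                    rte_strerror(rte_errno));

            return rte_eth_tx_burst(port, queue, pkts, nb_prep);
    }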

Adding this extra step to the csum engine costs about a 3-4%
performance drop on my setup with the ixgbe driver, caused mostly by
the need to re-access and modify the packet data.

Signed-off-by: Tomasz Kulasek <tomaszx.kulasek@intel.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 app/test-pmd/cmdline.c  |   49 +++++++++++++++++++++++++++++++++++++++++++++++
 app/test-pmd/csumonly.c |   33 ++++++++++++++++++++++++-------
 app/test-pmd/testpmd.c  |    5 +++++
 app/test-pmd/testpmd.h  |    2 ++
 4 files changed, 82 insertions(+), 7 deletions(-)

diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
index 63b55dc..373fc59 100644
--- a/app/test-pmd/cmdline.c
+++ b/app/test-pmd/cmdline.c
@@ -366,6 +366,10 @@ static void cmd_help_long_parsed(void *parsed_result,
                        "csum show (port_id)\n"
                        "    Display tx checksum offload configuration\n\n"

+                       "csum txprep (on|off)"
+                       "    Enable tx preparation path in csum forward engine"
+                       "\n\n"
+
                        "tso set (segsize) (portid)\n"
                        "    Enable TCP Segmentation Offload in csum forward"
                        " engine.\n"
@@ -3523,6 +3527,50 @@ struct cmd_csum_tunnel_result {
        },
 };

+/* Enable/disable tx preparation path */
+struct cmd_csum_txprep_result {
+       cmdline_fixed_string_t csum;
+       cmdline_fixed_string_t parse;
+       cmdline_fixed_string_t onoff;
+};
+
+static void
+cmd_csum_txprep_parsed(void *parsed_result,
+                      __attribute__((unused)) struct cmdline *cl,
+                      __attribute__((unused)) void *data)
+{
+       struct cmd_csum_txprep_result *res = parsed_result;
+
+       if (!strcmp(res->onoff, "on"))
+               tx_prepare = 1;
+       else
+               tx_prepare = 0;
+
+}
+
+cmdline_parse_token_string_t cmd_csum_txprep_csum =
+       TOKEN_STRING_INITIALIZER(struct cmd_csum_txprep_result,
+                               csum, "csum");
+cmdline_parse_token_string_t cmd_csum_txprep_parse =
+       TOKEN_STRING_INITIALIZER(struct cmd_csum_txprep_result,
+                               parse, "txprep");
+cmdline_parse_token_string_t cmd_csum_txprep_onoff =
+       TOKEN_STRING_INITIALIZER(struct cmd_csum_txprep_result,
+                               onoff, "on#off");
+
+cmdline_parse_inst_t cmd_csum_txprep = {
+       .f = cmd_csum_txprep_parsed,
+       .data = NULL,
+       .help_str = "enable/disable tx preparation path for csum engine: "
+       "csum txprep on|off",
+       .tokens = {
+               (void *)&cmd_csum_txprep_csum,
+               (void *)&cmd_csum_txprep_parse,
+               (void *)&cmd_csum_txprep_onoff,
+               NULL,
+       },
+};
+
 /* *** ENABLE HARDWARE SEGMENTATION IN TX NON-TUNNELED PACKETS *** */
 struct cmd_tso_set_result {
        cmdline_fixed_string_t tso;
@@ -11470,6 +11518,7 @@ struct cmd_set_vf_mac_addr_result {
        (cmdline_parse_inst_t *)&cmd_csum_set,
        (cmdline_parse_inst_t *)&cmd_csum_show,
        (cmdline_parse_inst_t *)&cmd_csum_tunnel,
+       (cmdline_parse_inst_t *)&cmd_csum_txprep,
        (cmdline_parse_inst_t *)&cmd_tso_set,
        (cmdline_parse_inst_t *)&cmd_tso_show,
        (cmdline_parse_inst_t *)&cmd_tunnel_tso_set,
diff --git a/app/test-pmd/csumonly.c b/app/test-pmd/csumonly.c
index 57e6ae2..3afa9ab 100644
--- a/app/test-pmd/csumonly.c
+++ b/app/test-pmd/csumonly.c
@@ -372,8 +372,10 @@ struct simple_gre_hdr {
                        udp_hdr->dgram_cksum = 0;
                        if (testpmd_ol_flags & TESTPMD_TX_OFFLOAD_UDP_CKSUM) {
                                ol_flags |= PKT_TX_UDP_CKSUM;
-                               udp_hdr->dgram_cksum = get_psd_sum(l3_hdr,
-                                       info->ethertype, ol_flags);
+                               if (!tx_prepare)
+                                       udp_hdr->dgram_cksum = get_psd_sum(
+                                                       l3_hdr, info->ethertype,
+                                                       ol_flags);
                        } else {
                                udp_hdr->dgram_cksum =
                                        get_udptcp_checksum(l3_hdr, udp_hdr,
@@ -385,12 +387,15 @@ struct simple_gre_hdr {
                tcp_hdr->cksum = 0;
                if (tso_segsz) {
                        ol_flags |= PKT_TX_TCP_SEG;
-                       tcp_hdr->cksum = get_psd_sum(l3_hdr, info->ethertype,
-                               ol_flags);
+                       if (!tx_prepare)
+                               tcp_hdr->cksum = get_psd_sum(l3_hdr,
+                                               info->ethertype, ol_flags);
+
                } else if (testpmd_ol_flags & TESTPMD_TX_OFFLOAD_TCP_CKSUM) {
                        ol_flags |= PKT_TX_TCP_CKSUM;
-                       tcp_hdr->cksum = get_psd_sum(l3_hdr, info->ethertype,
-                               ol_flags);
+                       if (!tx_prepare)
+                               tcp_hdr->cksum = get_psd_sum(l3_hdr,
+                                               info->ethertype, ol_flags);
                } else {
                        tcp_hdr->cksum =
                                get_udptcp_checksum(l3_hdr, tcp_hdr,
@@ -648,6 +653,7 @@ struct simple_gre_hdr {
        void *l3_hdr = NULL, *outer_l3_hdr = NULL; /* can be IPv4 or IPv6 */
        uint16_t nb_rx;
        uint16_t nb_tx;
+       uint16_t nb_prep;
        uint16_t i;
        uint64_t rx_ol_flags, tx_ol_flags;
        uint16_t testpmd_ol_flags;
@@ -857,7 +863,20 @@ struct simple_gre_hdr {
                        printf("\n");
                }
        }
-       nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_rx);
+
+       if (tx_prepare) {
+               nb_prep = rte_eth_tx_prepare(fs->tx_port, fs->tx_queue,
+                               pkts_burst, nb_rx);
+               if (nb_prep != nb_rx)
+                       printf("Preparing packet burst to transmit failed: 
%s\n",
+                                       rte_strerror(rte_errno));
+
+               nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst,
+                               nb_prep);
+       } else
+               nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst,
+                               nb_rx);
+
        /*
         * Retry if necessary
         */
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index a0332c2..c18bc28 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -180,6 +180,11 @@ struct fwd_engine * fwd_engines[] = {
 enum tx_pkt_split tx_pkt_split = TX_PKT_SPLIT_OFF;
 /**< Split policy for packets to TX. */

+/*
+ * Enable Tx preparation path in the "csum" engine.
+ */
+uint8_t tx_prepare = 0;
+
 uint16_t nb_pkt_per_burst = DEF_PKT_BURST; /**< Number of packets per burst. */
 uint16_t mb_mempool_cache = DEF_MBUF_CACHE; /**< Size of mbuf mempool cache. */

diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index 9c1e703..488a6e1 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -383,6 +383,8 @@ enum tx_pkt_split {

 extern enum tx_pkt_split tx_pkt_split;

+extern uint8_t tx_prepare;
+
 extern uint16_t nb_pkt_per_burst;
 extern uint16_t mb_mempool_cache;
 extern int8_t rx_pthresh;
-- 
1.7.9.5
