Hi Peng,

Peng He <xnhp0...@gmail.com> writes:

> From: Peng He <xnhp0...@gmail.com>
>
> ipf_postprocess will emit packets into the datapath pipeline ignoring
> the conntrack context, this might casuse weird issues when a packet
> batch has less space to hold all the fragments belonging to single
> packet.
>
> Given the below ruleest and consider sending a 64K ICMP packet which
> is splitted into 64 fragments.
>
> priority=1,action=drop
> priority=10,arp,action=normal
> priority=100,in_port=1,ct_state=-trk,icmp,action=ct(zone=9,table=0)
> priority=100,in_port=1,ct_state=+new+trk,icmp,action=ct(zone=9,commit),2
> priority=100,in_port=1,ct_state=-new+est+trk,icmp,action=2
> priority=100,in_port=2,ct_state=-trk,icmp,action=ct(table=0,zone=9)
> priority=100,in_port=2,ct_state=+trk+est-new,icmp,action=1
>
> Batch 1:
> the first 32 packets will be buffered in the ipf preprocessing, nothing
> more proceeds.
>
> Batch 2:
> the second 32 packets succeed the fragment reassembly and goes to ct
> and ipf_post will emits the first 32 packets due to the limit of batch
> size.
>
> the first 32 packets goes to the datapath again due to the
> recirculation, and again buffered at ipf preprocessing before ct commit,
> then the ovs tries to call ct commit as well as ipf_postprocessing which emits
> the last 32 packets, in this case the last 32 packets will follow
> the current actions which will be sent to port 2 directly without
> recirculation and going to ipf preprocssing and ct commit again.
>
> This will cause the first 32 packets never get the chance to
> reassemble and evevntually this large ICMP packets fail to transmit.
>
> this patch fixes this issue by adding firstly ipf context to avoid
> ipf_postprocessing emits packets in the wrong context. Then by
> re-executing the action list again to emit the last 32 packets
> in the right context to correctly transmitting multiple fragments.
>
> Last, we reuse the recirc_depth to limit the times of re-execution
> as re-execution is implemented by recursive function call.
>
> Signed-off-by: Peng He <hepeng.0...@bytedance.com>
> ---

Thanks for looking into this problem - it's a current problem with the
userspace ipf implementation.

>  lib/conntrack.c         |  9 +++----
>  lib/conntrack.h         | 15 +++++------
>  lib/dpif-netdev.c       | 56 +++++++++++++++++++++++++++++++++++++----
>  lib/ipf.c               | 40 +++++++++++++++++++++++------
>  lib/ipf.h               | 11 ++++++--
>  tests/system-traffic.at | 33 ++++++++++++++++++++++++
>  tests/test-conntrack.c  |  6 ++---
>  7 files changed, 140 insertions(+), 30 deletions(-)
>
> diff --git a/lib/conntrack.c b/lib/conntrack.c
> index 33a1a9295..72999f1ae 100644
> --- a/lib/conntrack.c
> +++ b/lib/conntrack.c
> @@ -1434,14 +1434,14 @@ process_one(struct conntrack *ct, struct dp_packet 
> *pkt,
>   * connection tables.  'setmark', if not NULL, should point to a two
>   * elements array containing a value and a mask to set the connection mark.
>   * 'setlabel' behaves similarly for the connection label.*/
> -int
> +bool
>  conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
>                    ovs_be16 dl_type, bool force, bool commit, uint16_t zone,
>                    const uint32_t *setmark,
>                    const struct ovs_key_ct_labels *setlabel,
>                    ovs_be16 tp_src, ovs_be16 tp_dst, const char *helper,
>                    const struct nat_action_info_t *nat_action_info,
> -                  long long now, uint32_t tp_id)
> +                  long long now, uint32_t tp_id, struct ipf_ctx *ipf_ctx)
>  {
>      ipf_preprocess_conntrack(ct->ipf, pkt_batch, now, dl_type, zone,
>                               ct->hash_basis);
> @@ -1468,9 +1468,8 @@ conntrack_execute(struct conntrack *ct, struct 
> dp_packet_batch *pkt_batch,
>          }
>      }
>  
> -    ipf_postprocess_conntrack(ct->ipf, pkt_batch, now, dl_type);
> -
> -    return 0;
> +    return ipf_postprocess_conntrack(ct->ipf, pkt_batch, ipf_ctx, \
> +                                     now, dl_type);
>  }
>  
>  void
> diff --git a/lib/conntrack.h b/lib/conntrack.h
> index 9553b188a..211efad3f 100644
> --- a/lib/conntrack.h
> +++ b/lib/conntrack.h
> @@ -64,6 +64,7 @@
>  struct dp_packet_batch;
>  
>  struct conntrack;
> +struct ipf_ctx;
>  
>  union ct_addr {
>      ovs_be32 ipv4;
> @@ -88,13 +89,13 @@ struct nat_action_info_t {
>  struct conntrack *conntrack_init(void);
>  void conntrack_destroy(struct conntrack *);
>  
> -int conntrack_execute(struct conntrack *ct, struct dp_packet_batch 
> *pkt_batch,
> -                      ovs_be16 dl_type, bool force, bool commit, uint16_t 
> zone,
> -                      const uint32_t *setmark,
> -                      const struct ovs_key_ct_labels *setlabel,
> -                      ovs_be16 tp_src, ovs_be16 tp_dst, const char *helper,
> -                      const struct nat_action_info_t *nat_action_info,
> -                      long long now, uint32_t tp_id);
> +bool conntrack_execute(struct conntrack *ct, struct dp_packet_batch 
> *pkt_batch,
> +                       ovs_be16 dl_type, bool force, bool commit,
> +                       uint16_t zone, const uint32_t *setmark,
> +                       const struct ovs_key_ct_labels *setlabel,
> +                       ovs_be16 tp_src, ovs_be16 tp_dst, const char *helper,
> +                       const struct nat_action_info_t *nat_action_info,
> +                       long long now, uint32_t tp_id, struct ipf_ctx 
> *ipf_ctx);
>  void conntrack_clear(struct dp_packet *packet);
>  
>  struct conntrack_dump {
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 98453a206..c4c837e2e 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -7756,6 +7756,7 @@ dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
>  struct dp_netdev_execute_aux {
>      struct dp_netdev_pmd_thread *pmd;
>      const struct flow *flow;
> +    const struct nlattr *redo_actions;
>  };
>  
>  static void
> @@ -8285,10 +8286,21 @@ dp_execute_cb(void *aux_, struct dp_packet_batch 
> *packets_,
>              VLOG_WARN_RL(&rl, "NAT specified without commit.");
>          }
>  
> -        conntrack_execute(dp->conntrack, packets_, aux->flow->dl_type, force,
> -                          commit, zone, setmark, setlabel, aux->flow->tp_src,
> -                          aux->flow->tp_dst, helper, nat_action_info_ref,
> -                          pmd->ctx.now / 1000, tp_id);
> +        bool more_pkts;
> +        struct ipf_ctx ipf_ctx = { aux->flow->recirc_id,
> +                                   aux->flow->in_port,
> +                                   zone };
> +        more_pkts = conntrack_execute(dp->conntrack, packets_,
> +                                      aux->flow->dl_type,
> +                                      force, commit, zone,
> +                                      setmark, setlabel,
> +                                      aux->flow->tp_src,
> +                                      aux->flow->tp_dst,
> +                                      helper, nat_action_info_ref,
> +                                      pmd->ctx.now / 1000, tp_id, &ipf_ctx);
> +        if (more_pkts) {
> +            aux->redo_actions = a;
> +        }
>          break;
>      }
>  
> @@ -8322,16 +8334,50 @@ dp_execute_cb(void *aux_, struct dp_packet_batch 
> *packets_,
>      dp_packet_delete_batch(packets_, should_steal);
>  }
>  
> +static size_t
> +dp_netdev_find_actions_left(const struct nlattr *actions,
> +                            size_t actions_len,
> +                            const struct nlattr *target)
> +{
> +    const struct nlattr *a;
> +    size_t left;
> +    NL_ATTR_FOR_EACH_UNSAFE (a, left, actions, actions_len) {
> +        if (a == target) {
> +            break;
> +        }
> +    }
> +    return left;
> +}
> +
>  static void
>  dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
>                            struct dp_packet_batch *packets,
>                            bool should_steal, const struct flow *flow,
>                            const struct nlattr *actions, size_t actions_len)
>  {
> -    struct dp_netdev_execute_aux aux = { pmd, flow };
> +    struct dp_netdev_execute_aux aux = { pmd, flow, NULL };
>  
>      odp_execute_actions(&aux, packets, should_steal, actions,
>                          actions_len, dp_execute_cb);

I'm concerned here - we currently have an issue with stack explosion in
some corner cases during normal packet processing.  I'm worried that we
just expanded it to another place.

Any thought to doing this iteratively instead?

> +
> +    if (OVS_UNLIKELY(aux.redo_actions)) {
> +        uint32_t *depth = recirc_depth_get();
> +        /* reuse depth to limit re-execution times to avoid too
> +         * large fragments.
> +         */
> +        if (*depth < MAX_RECIRC_DEPTH) {
> +            struct dp_packet_batch batch;
> +            dp_packet_batch_init(&batch);
> +            size_t redo_actions_len = \
> +                                      dp_netdev_find_actions_left(actions,
> +                                              actions_len,
> +                                              aux.redo_actions);
> +            (*depth) ++;
> +            dp_netdev_execute_actions(pmd, &batch, should_steal, flow,
> +                    aux.redo_actions, redo_actions_len);
> +            (*depth) --;
> +        }
> +    }
>  }
>  
>  struct dp_netdev_ct_dump {
> diff --git a/lib/ipf.c b/lib/ipf.c
> index 507db2aea..1fd3d8d30 100644
> --- a/lib/ipf.c
> +++ b/lib/ipf.c
> @@ -1046,30 +1046,52 @@ ipf_send_frags_in_list(struct ipf *ipf, struct 
> ipf_list *ipf_list,
>      OVS_NOT_REACHED();
>  }
>  
> +static bool
> +ipf_ctx_eq(struct ipf_list *ipf_list, struct ipf_ctx *ctx)
> +{
> +    struct dp_packet *pkt =
> +        ipf_list->frag_list[ipf_list->last_sent_idx + 1].pkt;
> +
> +    if (pkt->md.recirc_id != ctx->recirc_id ||
> +            pkt->md.in_port.odp_port != ctx->in_port.odp_port ||
> +            pkt->md.ct_zone != ctx->zone) {
> +        return false;
> +    }
> +    return true;
> +}
> +
>  /* Adds fragments associated with a completed fragment list to a packet batch
>   * to be processed by the calling application, typically conntrack. Also
>   * cleans up the list context when it is empty.*/
> -static void
> +static bool
>  ipf_send_completed_frags(struct ipf *ipf, struct dp_packet_batch *pb,
> -                         long long now, bool v6)
> +                         struct ipf_ctx *ctx, long long now, bool v6)
>  {
>      if (ovs_list_is_empty(&ipf->frag_complete_list)) {
> -        return;
> +        return false;
>      }
>  
> +    bool more_pkts = false;
>      ovs_mutex_lock(&ipf->ipf_lock);
>      struct ipf_list *ipf_list, *next;
>  
>      LIST_FOR_EACH_SAFE (ipf_list, next, list_node, &ipf->frag_complete_list) 
> {
> +
> +        if (ctx && !ipf_ctx_eq(ipf_list, ctx)) {
> +            continue;
> +        }
> +
>          if (ipf_send_frags_in_list(ipf, ipf_list, pb, 
> IPF_FRAG_COMPLETED_LIST,
>                                     v6, now)) {
>              ipf_completed_list_clean(&ipf->frag_lists, ipf_list);
>          } else {
> +            more_pkts = true;
>              break;
>          }
>      }
>  
>      ovs_mutex_unlock(&ipf->ipf_lock);
> +    return more_pkts;
>  }
>  
>  /* Conservatively adds fragments associated with a expired fragment list to
> @@ -1225,8 +1247,8 @@ ipf_post_execute_reass_pkts(struct ipf *ipf,
>   * be added to the batch to be sent through conntrack. */
>  void
>  ipf_preprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb,
> -                         long long now, ovs_be16 dl_type, uint16_t zone,
> -                         uint32_t hash_basis)
> +                         long long now, ovs_be16 dl_type,
> +                         uint16_t zone, uint32_t hash_basis)
>  {
>      if (ipf_get_enabled(ipf)) {
>          ipf_extract_frags_from_batch(ipf, pb, dl_type, zone, now, 
> hash_basis);
> @@ -1241,16 +1263,18 @@ ipf_preprocess_conntrack(struct ipf *ipf, struct 
> dp_packet_batch *pb,
>   * through conntrack and adds these fragments to any batches seen.  Expired
>   * fragments are marked as invalid and also added to the batches seen
>   * with low priority.  Reassembled packets are freed. */
> -void
> +bool
>  ipf_postprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb,
> -                          long long now, ovs_be16 dl_type)
> +                          struct ipf_ctx *ctx, long long now, ovs_be16 
> dl_type)
>  {
> +    bool more_pkts = false;
>      if (ipf_get_enabled(ipf) || atomic_count_get(&ipf->nfrag)) {
>          bool v6 = dl_type == htons(ETH_TYPE_IPV6);
>          ipf_post_execute_reass_pkts(ipf, pb, v6);
> -        ipf_send_completed_frags(ipf, pb, now, v6);
> +        more_pkts = ipf_send_completed_frags(ipf, pb, ctx, now, v6);
>          ipf_send_expired_frags(ipf, pb, now, v6);
>      }
> +    return more_pkts;
>  }
>  
>  static void *
> diff --git a/lib/ipf.h b/lib/ipf.h
> index 6ac91b270..7bbf453af 100644
> --- a/lib/ipf.h
> +++ b/lib/ipf.h
> @@ -40,14 +40,21 @@ struct ipf_status {
>     unsigned int nfrag_max;
>  };
>  
> +struct ipf_ctx {
> +    uint32_t recirc_id;
> +    union flow_in_port in_port; /* Input port. */
> +    uint16_t zone;
> +};
> +
>  struct ipf *ipf_init(void);
>  void ipf_destroy(struct ipf *ipf);
>  void ipf_preprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb,
>                                long long now, ovs_be16 dl_type, uint16_t zone,
>                                uint32_t hash_basis);
>  
> -void ipf_postprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb,
> -                               long long now, ovs_be16 dl_type);
> +bool ipf_postprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb,
> +                               struct ipf_ctx *ctx, long long now,
> +                               ovs_be16 dl_type);
>  
>  int ipf_set_enabled(struct ipf *ipf, bool v6, bool enable);
>  int ipf_set_min_frag(struct ipf *ipf, bool v6, uint32_t value);
> diff --git a/tests/system-traffic.at b/tests/system-traffic.at
> index a69896d33..a09318882 100644
> --- a/tests/system-traffic.at
> +++ b/tests/system-traffic.at
> @@ -2741,6 +2741,39 @@ DPCTL_CHECK_FRAGMENTATION_PASS()
>  OVS_TRAFFIC_VSWITCHD_STOP
>  AT_CLEANUP
>  
> +AT_SETUP([conntrack - IPv4 large fragmentation])
> +CHECK_CONNTRACK()
> +OVS_TRAFFIC_VSWITCHD_START()
> +
> +ADD_NAMESPACES(at_ns0, at_ns1)
> +
> +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24")
> +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24")
> +
> +dnl Sending ping through conntrack
> +AT_DATA([flows.txt], [dnl
> +priority=1,action=drop
> +priority=10,arp,action=normal
> +priority=100,in_port=1,ct_state=-trk,icmp,action=ct(zone=9,table=0)
> +priority=100,in_port=1,ct_state=+new+trk,icmp,action=ct(zone=9,commit),2
> +priority=100,in_port=1,ct_state=-new+est+trk,icmp,action=2
> +priority=100,in_port=2,ct_state=-trk,icmp,action=ct(table=0,zone=9)
> +priority=100,in_port=2,ct_state=+trk+est-new,icmp,action=1
> +])
> +
> +AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt])
> +
> +dnl Modify userspace conntrack fragmentation handling.
> +DPCTL_MODIFY_FRAGMENTATION()
> +
> +dnl Ipv4 larger fragmentation connectivity check.
> +NS_CHECK_EXEC([at_ns0], [ping -s 65000 -q -c 3 -i 0.3 -w 2 10.1.1.2 | 
> FORMAT_PING], [0], [dnl
> +3 packets transmitted, 3 received, 0% packet loss, time 0ms
> +])
> +
> +OVS_TRAFFIC_VSWITCHD_STOP
> +AT_CLEANUP
> +
>  AT_SETUP([conntrack - IPv4 fragmentation expiry])
>  CHECK_CONNTRACK()
>  OVS_TRAFFIC_VSWITCHD_START()
> diff --git a/tests/test-conntrack.c b/tests/test-conntrack.c
> index 24c93e4a4..f90b7aa78 100644
> --- a/tests/test-conntrack.c
> +++ b/tests/test-conntrack.c
> @@ -91,7 +91,7 @@ ct_thread_main(void *aux_)
>      ovs_barrier_block(&barrier);
>      for (i = 0; i < n_pkts; i += batch_size) {
>          conntrack_execute(ct, pkt_batch, dl_type, false, true, 0, NULL, NULL,
> -                          0, 0, NULL, NULL, now, 0);
> +                          0, 0, NULL, NULL, now, 0, NULL);
>          DP_PACKET_BATCH_FOR_EACH (j, pkt, pkt_batch) {
>              pkt_metadata_init_conn(&pkt->md);
>          }
> @@ -178,7 +178,7 @@ pcap_batch_execute_conntrack(struct conntrack *ct_,
>  
>          if (flow.dl_type != dl_type) {
>              conntrack_execute(ct_, &new_batch, dl_type, false, true, 0,
> -                              NULL, NULL, 0, 0, NULL, NULL, now, 0);
> +                              NULL, NULL, 0, 0, NULL, NULL, now, 0, NULL);
>              dp_packet_batch_init(&new_batch);
>          }
>          dp_packet_batch_add(&new_batch, packet);
> @@ -186,7 +186,7 @@ pcap_batch_execute_conntrack(struct conntrack *ct_,
>  
>      if (!dp_packet_batch_is_empty(&new_batch)) {
>          conntrack_execute(ct_, &new_batch, dl_type, false, true, 0, NULL, 
> NULL,
> -                          0, 0, NULL, NULL, now, 0);
> +                          0, 0, NULL, NULL, now, 0, NULL);
>      }
>  
>  }

_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to