From: Andy Zhou <az...@nicira.com>

The conntrack action now re-assembles fragmented IPv4 packets and only
sends a fully re-assembled IP packet to the nf_conntrack layer.

When a re-assembled IP frame hits the output action, the output action
re-fragments it into IP fragments based on the packet's incoming
fragment size.

Signed-off-by: Andy Zhou <az...@nicira.com>
---
 include/uapi/linux/openvswitch.h |    5 ++-
 net/openvswitch/actions.c        |   78 ++++++++++++++++++++++++++++++++++----
 net/openvswitch/conntrack.c      |   43 ++++++++++++++++++++-
 net/openvswitch/datapath.c       |   40 ++++++++++++++++---
 net/openvswitch/datapath.h       |    6 +++
 net/openvswitch/vport.c          |    1 +
 6 files changed, 157 insertions(+), 16 deletions(-)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 30d70a3..b947544 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -162,7 +162,9 @@ enum ovs_packet_cmd {
  * %OVS_USERSPACE_ATTR_EGRESS_TUN_PORT attribute, which is sent only if the
  * output port is actually a tunnel port. Contains the output tunnel key
  * extracted from the packet as nested %OVS_TUNNEL_KEY_ATTR_* attributes.
- *
+ * @OVS_PACKET_ATTR_MRU: Present for an %OVS_PACKET_CMD_ACTION and
+ * %OVS_PACKET_ATTR_USERSPACE action. Specifies the maximum received fragment
+ * size.
  * These attributes follow the &struct ovs_header within the Generic Netlink
  * payload for %OVS_PACKET_* commands.
  */
@@ -178,6 +180,7 @@ enum ovs_packet_attr {
        OVS_PACKET_ATTR_UNUSED2,
        OVS_PACKET_ATTR_PROBE,      /* Packet operation is a feature probe,
                                       error logging should be suppressed. */
+       OVS_PACKET_ATTR_MRU,          /* Maximum received IP fragment size. */
        __OVS_PACKET_ATTR_MAX
 };
 
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 9bd9f99..789e53a 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -53,6 +53,11 @@ struct deferred_action {
        struct sw_flow_key pkt_key;
 };
 
+/* Argument bundle handed to ovs_vport_output() for every fragment that
+ * do_output() produces when it re-fragments an oversized packet.
+ */
+struct vport_frag_output_info {
+       /* Port each fragment is sent out of. */
+       struct vport *vport;
+       /* Flow key used to rebuild the Ethernet header of each fragment. */
+       struct sw_flow_key *key;
+};
+
 #define DEFERRED_ACTION_FIFO_SIZE 10
 struct action_fifo {
        int head;
@@ -595,14 +600,67 @@ static int set_sctp(struct sk_buff *skb, struct 
sw_flow_key *flow_key,
        return 0;
 }
 
-static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port)
+/* Given an IP frame, reconstruct its MAC header based on flow.
+ *
+ * The callers in this patch pull the L2 header off first, so skb->data is
+ * expected to sit at the start of the IP header.  ETH_HLEN bytes are pushed
+ * in front of it and filled with the source address, destination address
+ * and EtherType recorded in 'key'.
+ *
+ * Returns 0 on success, or a negative errno if headroom for the header
+ * could not be obtained.  'skb' is not freed here; the caller still owns it.
+ */
+int ovs_setup_l2_header(struct sk_buff *skb, struct sw_flow_key *key)
+{
+       struct ethhdr *eth;
+       int err;
+
+       /* __skb_push() does no bounds checking, so make sure ETH_HLEN bytes
+        * of private, writable headroom exist before pushing.
+        */
+       err = skb_cow_head(skb, ETH_HLEN);
+       if (unlikely(err))
+               return err;
+
+       __skb_push(skb, ETH_HLEN);
+       skb_reset_mac_header(skb);
+
+       eth = eth_hdr(skb);
+       ether_addr_copy(eth->h_source, key->eth.src);
+       ether_addr_copy(eth->h_dest, key->eth.dst);
+       eth->h_proto = key->eth.type;
+
+       return 0;
+}
+
+/* Per-fragment output callback used by do_output().
+ *
+ * 'output_arg' points to a struct vport_frag_output_info.  The Ethernet
+ * header is rebuilt from its flow key and the fragment is then sent out of
+ * its vport.  If the header cannot be rebuilt the fragment is freed and the
+ * error is returned; otherwise 0 is returned.
+ */
+static int ovs_vport_output(struct sk_buff *skb, void *output_arg)
+{
+       struct vport_frag_output_info *info = output_arg;
+       int err = ovs_setup_l2_header(skb, info->key);
+
+       if (!err) {
+               ovs_vport_send(info->vport, skb);
+               return 0;
+       }
+
+       kfree_skb(skb);
+       return err;
+}
+
+/* Send 'skb' out of datapath port 'out_port'.  Ownership of 'skb' passes to
+ * this function.
+ *
+ * A non-zero OVS_CB(skb)->mru limits the L3 size of what may be sent: a
+ * packet longer than mru + ETH_HLEN is re-fragmented (IPv4 only) and each
+ * fragment is sent through ovs_vport_output(), which rebuilds its Ethernet
+ * header from 'key'.  An oversized packet that is not IPv4 is dropped, as is
+ * any packet whose output port does not exist.
+ */
+static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port,
+                     struct sw_flow_key *key)
 {
        struct vport *vport = ovs_vport_rcu(dp, out_port);
+       unsigned int mru = OVS_CB(skb)->mru;
 
-       if (likely(vport))
-               ovs_vport_send(vport, skb);
-       else
+       if (likely(vport)) {
+               if (!mru || (skb->len <= mru + ETH_HLEN)) {
+                       ovs_vport_send(vport, skb);
+               } else if (key->eth.type == htons(ETH_P_IP)) {
+                       struct vport_frag_output_info arg;
+
+                       arg.vport = vport;
+                       arg.key = key;
+
+                       /* Fragment the bare IP packet; the L2 header is
+                        * put back on every fragment by ovs_vport_output().
+                        */
+                       skb_pull(skb, ETH_HLEN);
+
+                       /* NOTE(review): ip_fragment_mtu() is assumed to
+                        * consume 'skb' on every path - confirm.
+                        */
+                       ip_fragment_mtu(skb, mru, LL_MAX_HEADER, NULL, &arg,
+                                       ovs_vport_output);
+               } else {
+                       /* Too large and not IPv4: it cannot be fragmented
+                        * here, so drop it instead of leaking the skb.
+                        */
+                       kfree_skb(skb);
+               }
+       } else {
                kfree_skb(skb);
+       }
 }
 
 static int output_userspace(struct datapath *dp, struct sk_buff *skb,
@@ -617,6 +675,7 @@ static int output_userspace(struct datapath *dp, struct 
sk_buff *skb,
        upcall.userdata = NULL;
        upcall.portid = 0;
        upcall.egress_tun_info = NULL;
+       upcall.mru = OVS_CB(skb)->mru;
 
        for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
                 a = nla_next(a, &rem)) {
@@ -865,7 +924,7 @@ static int do_execute_actions(struct datapath *dp, struct 
sk_buff *skb,
                        struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC);
 
                        if (out_skb)
-                               do_output(dp, out_skb, prev_port);
+                               do_output(dp, out_skb, prev_port, key);
 
                        prev_port = -1;
                }
@@ -929,13 +988,18 @@ static int do_execute_actions(struct datapath *dp, struct 
sk_buff *skb,
                }
 
                if (unlikely(err)) {
-                       kfree_skb(skb);
+                       /* Hide stolen fragments from user space. */
+                       if (err == -EINPROGRESS)
+                               err = 0;
+                       else
+                               kfree_skb(skb);
+
                        return err;
                }
        }
 
        if (prev_port != -1)
-               do_output(dp, skb, prev_port);
+               do_output(dp, skb, prev_port, key);
        else
                consume_skb(skb);
 
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 93d76a5..793d489 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -178,21 +178,60 @@ static int ovs_ct_lookup(struct net *net, u16 zone, 
struct sw_flow_key *key,
        return err;
 }
 
+/* Reassemble an IPv4 fragment before the packet is handed to conntrack.
+ *
+ * Anything that is not an IPv4 fragment passes through untouched and 0 is
+ * returned.  A fragment is given to ip_defrag_net() under a per-zone defrag
+ * user; a non-zero result from it is returned unchanged.  Once a complete
+ * packet is available its hash is cleared, its IP checksum is recomputed,
+ * its Ethernet header is rebuilt from 'key', and the fragment size reported
+ * by ip_defrag_net() is recorded in OVS_CB(skb)->mru.
+ */
+static int handle_fragments(struct net *net, u16 zone, struct sk_buff *skb,
+                           struct sw_flow_key *key)
+{
+       struct ovs_skb_cb saved_cb;
+       enum ip_defrag_users user;
+       unsigned int mru;
+       int err;
+
+       /* XXX Handle IPv6 */
+       if (key->eth.type != htons(ETH_P_IP))
+               return 0;
+
+       if (!ip_is_fragment(ip_hdr(skb)))
+               return 0;
+
+       /* The IP control block is zeroed below; keep a copy of the OVS
+        * control block so it can be written back after reassembly.
+        */
+       saved_cb = *OVS_CB(skb);
+       memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+
+       user = IP_DEFRAG_CONNTRACK_IN + zone;
+       skb_pull(skb, skb_network_offset(skb));
+
+       err = ip_defrag_net(net, skb, user, &mru);
+       if (err)
+               return err;
+
+       /* Got a reassembled IP frame */
+       skb_clear_hash(skb);
+       ip_send_check(ip_hdr(skb));
+       skb->ignore_df = 1;
+
+       err = ovs_setup_l2_header(skb, key);
+       if (err)
+               return err;
+
+       saved_cb.mru = mru;
+       *OVS_CB(skb) = saved_cb;
+
+       return 0;
+}
+
 int ovs_ct_execute(struct sk_buff *skb, struct sw_flow_key *key,
                   const struct ovs_conntrack_info *info)
 {
        struct net *net;
-       int nh_ofs = skb_network_offset(skb);
        struct nf_conn *tmpl = info->ct;
-       int err = -EINVAL;
+       int nh_ofs, err;
 
        net = ovs_get_net(skb);
        if (IS_ERR(net))
                return PTR_ERR(net);
 
+       err = handle_fragments(net, info->zone, skb, key);
+       if (err)
+               return err;
+
        /* The conntrack module expects to be working at L3. */
+       nh_ofs = skb_network_offset(skb);
        skb_pull(skb, nh_ofs);
 
+       err = -EINVAL;
        if (ovs_ct_lookup__(net, tmpl, key, skb))
                goto err_push_skb;
 
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 46f67ee..1340f21 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -277,6 +277,7 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct 
sw_flow_key *key)
                upcall.userdata = NULL;
                upcall.portid = ovs_vport_find_upcall_portid(p, skb);
                upcall.egress_tun_info = NULL;
+               upcall.mru = OVS_CB(skb)->mru;
                error = ovs_dp_upcall(dp, skb, key, &upcall);
                if (unlikely(error))
                        kfree_skb(skb);
@@ -398,9 +399,23 @@ static size_t upcall_msg_size(const struct dp_upcall_info 
*upcall_info,
        if (upcall_info->egress_tun_info)
                size += nla_total_size(ovs_tun_key_attr_size());
 
+       /* OVS_PACKET_ATTR_MRU */
+       if (upcall_info->mru)
+               size += nla_total_size(sizeof(unsigned int));
+
        return size;
 }
 
+/* Zero-pad 'skb' up to the next netlink attribute alignment boundary,
+ * unless the datapath has the OVS_DP_F_UNALIGNED user feature set.
+ */
+static void pad_packet(struct datapath *dp, struct sk_buff *skb)
+{
+       size_t pad;
+
+       if (dp->user_features & OVS_DP_F_UNALIGNED)
+               return;
+
+       pad = NLA_ALIGN(skb->len) - skb->len;
+       if (pad > 0)
+               memset(skb_put(skb, pad), 0, pad);
+}
+
 static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
                                  const struct sw_flow_key *key,
                                  const struct dp_upcall_info *upcall_info)
@@ -479,6 +494,16 @@ static int queue_userspace_packet(struct datapath *dp, 
struct sk_buff *skb,
                nla_nest_end(user_skb, nla);
        }
 
+       /* Add OVS_PACKET_ATTR_MRU */
+       if (upcall_info->mru) {
+               if (nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU,
+                               upcall_info->mru)) {
+                       err = -ENOBUFS;
+                       goto out;
+               }
+               pad_packet(dp, user_skb);
+       }
+
        /* Only reserve room for attribute header, packet data is added
         * in skb_zerocopy() */
        if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) {
@@ -492,12 +517,7 @@ static int queue_userspace_packet(struct datapath *dp, 
struct sk_buff *skb,
                goto out;
 
        /* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */
-       if (!(dp->user_features & OVS_DP_F_UNALIGNED)) {
-               size_t plen = NLA_ALIGN(user_skb->len) - user_skb->len;
-
-               if (plen > 0)
-                       memset(skb_put(user_skb, plen), 0, plen);
-       }
+       pad_packet(dp, user_skb);
 
        ((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len;
 
@@ -526,6 +546,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, 
struct genl_info *info)
        int len;
        int err;
        bool log = !a[OVS_PACKET_ATTR_PROBE];
+       unsigned int mru;
 
        err = -EINVAL;
        if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] ||
@@ -552,6 +573,12 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, 
struct genl_info *info)
        else
                packet->protocol = htons(ETH_P_802_2);
 
+       /* Set packet's mru */
+       mru = 0;
+       if (a[OVS_PACKET_ATTR_MRU])
+               mru = nla_get_u16(a[OVS_PACKET_ATTR_MRU]);
+       OVS_CB(packet)->mru = mru;
+
        /* Build an sw_flow for sending this packet. */
        flow = ovs_flow_alloc();
        err = PTR_ERR(flow);
@@ -612,6 +639,7 @@ static const struct nla_policy 
packet_policy[OVS_PACKET_ATTR_MAX + 1] = {
        [OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED },
        [OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED },
        [OVS_PACKET_ATTR_PROBE] = { .type = NLA_FLAG },
+       [OVS_PACKET_ATTR_MRU] = { .type = NLA_U16 },
 };
 
 static const struct genl_ops dp_packet_genl_ops[] = {
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 9661a01..cfbdda1 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -98,10 +98,13 @@ struct datapath {
  * NULL if the packet is not being tunneled.
  * @input_vport: The original vport packet came in on. This value is cached
  * when a packet is received by OVS.
+ * @mru: The maximum received fragment size; 0 if the packet is not
+ * fragmented.
  */
 struct ovs_skb_cb {
        struct ovs_tunnel_info  *egress_tun_info;
        struct vport            *input_vport;
+       unsigned int            mru;
 };
 #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb)
 
@@ -114,12 +117,14 @@ struct ovs_skb_cb {
  * then no packet is sent and the packet is accounted in the datapath's @n_lost
  * counter.
  * @egress_tun_info: If nonnull, becomes %OVS_PACKET_ATTR_EGRESS_TUN_KEY.
+ * @mru: If not zero, Maximum received IP fragment size.
  */
 struct dp_upcall_info {
        const struct ovs_tunnel_info *egress_tun_info;
        const struct nlattr *userdata;
        u32 portid;
        u8 cmd;
+       unsigned int mru;
 };
 
 /**
@@ -198,6 +203,7 @@ void ovs_dp_notify_wq(struct work_struct *work);
 
 int action_fifos_init(void);
 void action_fifos_exit(void);
+int ovs_setup_l2_header(struct sk_buff *skb, struct sw_flow_key *key);
 
 /* 'KEY' must not have any bits set outside of the 'MASK' */
 #define OVS_MASKED(OLD, KEY, MASK) ((KEY) | ((OLD) & ~(MASK)))
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index ec2954f..184dd51 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -486,6 +486,7 @@ void ovs_vport_receive(struct vport *vport, struct sk_buff 
*skb,
 
        OVS_CB(skb)->input_vport = vport;
        OVS_CB(skb)->egress_tun_info = NULL;
+       OVS_CB(skb)->mru = 0;
        /* Extract flow from 'skb' into 'key'. */
        error = ovs_flow_key_extract(tun_info, skb, &key);
        if (unlikely(error)) {
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to