On Wed, Sep 11, 2019 at 2:53 AM Ankur Sharma <ankur.sha...@nutanix.com> wrote:
> During E-W routing for vlan backed networks, we replace router port > mac with chassis mac, when packet leaves the source hypervisor. > > As a result, the destination VM (on remote hypervisor) will see > chassis mac as source mac in received packet. > > Although, functionality wise this does not cause any issue, > however chassis mac being see as source on VM, will > lead to following: > a. INCONSISTENT SOURCE MAC: > If the destination VM moves to same hypervisor as sender, > then it will see router port mac as source mac. Whereas, on > a remote hypervisor, source mac will be the sender chassis mac. > > This will cause inconsistency in packet headers for the same > flow and could be confusing for someone looking at packet > captures inside the vm. > > b. SYSTEM MAC BEING EXPOSED TO VM: > Chassis mac is a CMS provided mac, i.e it is an infrastructure > mac. It is not a good practice to expose such values to VM, > which should not be seeing them in first place. > > In order to replace chassis mac with router port mac, we will > do following. > > a. Create conjunction for each chassis mac and router port vlan > id combination. For example, for a 2 node chassis setup, where > we have a logical router, connected to 4 logical switches with > vlan ids: 2000, 1000, 0 and 24, the conjunction flow will look > like following: > > cookie=0x0, duration=9094.608s, table=0, n_packets=0, n_bytes=0, > idle_age=9094, priority=180,dl_src=aa:bb:cc:dd:ee:22 > actions=conjunction(100,1/2) > cookie=0x0, duration=9094.608s, table=0, n_packets=0, n_bytes=0, > idle_age=9094, priority=180,dl_src=aa:bb:cc:dd:ff:ee > actions=conjunction(100,1/2) > > cookie=0x0, duration=9094.552s, table=0, n_packets=0, n_bytes=0, > idle_age=9094, priority=180,dl_vlan=2000 actions=conjunction(100,2/2) > cookie=0x0, duration=9094.552s, table=0, n_packets=0, n_bytes=0, > idle_age=9094, priority=180,dl_vlan=1000 actions=conjunction(100,2/2) > cookie=0x0, duration=9094.552s, table=0, n_packets=0, n_bytes=0, > idle_age=9094, priority=180,vlan_tci=0x0000/0x1fff > actions=conjunction(100,2/2) > cookie=0x0, duration=9094.552s, table=0, n_packets=0, n_bytes=0, > idle_age=9094, priority=180,dl_vlan=24 actions=conjunction(100,2/2) > > b. Using this conjunction as match, we can identify if packet entering > destination > hypervisor was routed at the source or not. This will be done in > table=0 (Physical to logical) > at priority=180. > For example: > cookie=0x0, duration=9795.957s, table=0, n_packets=1391, > n_bytes=141882, idle_age=8396, > priority=180,conj_id=100,in_port=146,dl_vlan=1000 > actions=.........,mod_dl_src:00:00:01:01:02:03,... > > c. We use conjunction, as it will ensure that we do not end up having lot > of flows > as we scale up. If we do not use conjunction, then we will have > N (number of chassis macs) X M (number of router vlans) number of ovs > flows. > Conjunction converts it to N + M. > Consider a setup, with 500 Chassis and 500 routed vlans. > Without conjunction we will need 25000 (500 * 500) flows, > whereas with conjunction that number comes down to 1000 (500 + 500). > > Signed-off-by: Ankur Sharma <ankur.sha...@nutanix.com> > Thanks. I applied this to master. Numan > --- > controller/chassis.c | 2 +- > controller/chassis.h | 3 + > controller/ovn-controller.c | 5 + > controller/physical.c | 222 > ++++++++++++++++++++++++++++++++++++++++++-- > controller/physical.h | 1 + > ovn-architecture.7.xml | 10 +- > tests/ovn.at | 14 ++- > 7 files changed, 244 insertions(+), 13 deletions(-) > > diff --git a/controller/chassis.c b/controller/chassis.c > index 937c557..699b662 100644 > --- a/controller/chassis.c > +++ b/controller/chassis.c > @@ -144,7 +144,7 @@ get_bridge_mappings(const struct smap *ext_ids) > return smap_get_def(ext_ids, "ovn-bridge-mappings", ""); > } > > -static const char * > +const char * > get_chassis_mac_mappings(const struct smap *ext_ids) > { > return smap_get_def(ext_ids, "ovn-chassis-mac-mappings", ""); > diff --git a/controller/chassis.h b/controller/chassis.h > index eb46ca3..178d295 100644 > --- a/controller/chassis.h > +++ b/controller/chassis.h > @@ -27,6 +27,7 @@ struct sbrec_chassis; > struct sbrec_chassis_table; > struct sset; > struct eth_addr; > +struct smap; > > void chassis_register_ovs_idl(struct ovsdb_idl *); > const struct sbrec_chassis *chassis_run( > @@ -42,5 +43,7 @@ bool chassis_get_mac(const struct sbrec_chassis *chassis, > const char *bridge_mapping, > struct eth_addr *chassis_mac); > const char *chassis_get_id(void); > +const char * get_chassis_mac_mappings(const struct smap *ext_ids); > + > > #endif /* controller/chassis.h */ > diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c > index e27b56b..b5449b8 100644 > --- a/controller/ovn-controller.c > +++ b/controller/ovn-controller.c > @@ -1268,9 +1268,14 @@ en_flow_output_run(struct engine_node *node) > (struct sbrec_port_binding_table *)EN_OVSDB_GET( > engine_get_input("SB_port_binding", node)); > > + struct sbrec_chassis_table *chassis_table = > + (struct sbrec_chassis_table *)EN_OVSDB_GET( > + engine_get_input("SB_chassis", node)); > + > physical_run(sbrec_port_binding_by_name, > multicast_group_table, > port_binding_table, > + chassis_table, > mff_ovn_geneve, > br_int, chassis, ct_zones, > local_datapaths, local_lports, > diff --git a/controller/physical.c b/controller/physical.c > index f28c5f0..43113e4 100644 > --- a/controller/physical.c > +++ b/controller/physical.c > @@ -47,9 +47,23 @@ > > VLOG_DEFINE_THIS_MODULE(physical); > > +/* Datapath zone IDs for connection tracking and NAT */ > +struct zone_ids { > + int ct; /* MFF_LOG_CT_ZONE. */ > + int dnat; /* MFF_LOG_DNAT_ZONE. */ > + int snat; /* MFF_LOG_SNAT_ZONE. */ > +}; > + > +static void > +load_logical_ingress_metadata(const struct sbrec_port_binding *binding, > + const struct zone_ids *zone_ids, > + struct ofpbuf *ofpacts_p); > + > /* UUID to identify OF flows not associated with ovsdb rows. */ > static struct uuid *hc_uuid = NULL; > > +#define CHASSIS_MAC_TO_ROUTER_MAC_CONJID 100 > + > void > physical_register_ovs_idl(struct ovsdb_idl *ovs_idl) > { > @@ -199,12 +213,6 @@ get_localnet_port(const struct hmap *local_datapaths, > int64_t tunnel_key) > return ld ? ld->localnet_port : NULL; > } > > -/* Datapath zone IDs for connection tracking and NAT */ > -struct zone_ids { > - int ct; /* MFF_LOG_CT_ZONE. */ > - int dnat; /* MFF_LOG_DNAT_ZONE. */ > - int snat; /* MFF_LOG_SNAT_ZONE. */ > -}; > > static struct zone_ids > get_zone_ids(const struct sbrec_port_binding *binding, > @@ -388,6 +396,200 @@ put_remote_port_redirect_overlay(const struct > match, ofpacts_p, &binding->header_.uuid); > } > > + > +static struct hmap remote_chassis_macs = > + HMAP_INITIALIZER(&remote_chassis_macs); > + > +/* Maps from a physical network name to the chassis macs of remote > chassis. */ > +struct remote_chassis_mac { > + struct hmap_node hmap_node; > + char *chassis_mac; > + char *chassis_id; > +}; > + > +static void > +populate_remote_chassis_macs(const struct sbrec_chassis *my_chassis, > + const struct sbrec_chassis_table > *chassis_table) > +{ > + const struct sbrec_chassis *chassis; > + SBREC_CHASSIS_TABLE_FOR_EACH (chassis, chassis_table) { > + > + /* We want only remote chassis macs. */ > + if (!strcmp(my_chassis->name, chassis->name)) { > + continue; > + } > + > + const char *tokens > + = get_chassis_mac_mappings(&chassis->external_ids); > + > + if (!strlen(tokens)) { > + continue; > + } > + > + char *save_ptr = NULL; > + char *token; > + char *tokstr = xstrdup(tokens); > + > + /* Format for a chassis mac configuration is: > + * ovn-chassis-mac-mappings="bridge-name1:MAC1,bridge-name2:MAC2" > + */ > + for (token = strtok_r(tokstr, ",", &save_ptr); > + token != NULL; > + token = strtok_r(NULL, ",", &save_ptr)) { > + char *save_ptr2 = NULL; > + char *chassis_mac_bridge = strtok_r(token, ":", &save_ptr2); > + char *chassis_mac_str = strtok_r(NULL, "", &save_ptr2); > + struct remote_chassis_mac *remote_chassis_mac = NULL; > + remote_chassis_mac = xmalloc(sizeof *remote_chassis_mac); > + hmap_insert(&remote_chassis_macs, > &remote_chassis_mac->hmap_node, > + hash_string(chassis_mac_bridge, 0)); > + remote_chassis_mac->chassis_mac = xstrdup(chassis_mac_str); > + remote_chassis_mac->chassis_id = xstrdup(chassis->name); > + } > + free(tokstr); > + } > +} > + > +static void > +free_remote_chassis_macs(void) > +{ > + struct remote_chassis_mac *mac, *next_mac; > + > + HMAP_FOR_EACH_SAFE (mac, next_mac, hmap_node, &remote_chassis_macs) { > + hmap_remove(&remote_chassis_macs, &mac->hmap_node); > + free(mac->chassis_mac); > + free(mac->chassis_id); > + free(mac); > + } > +} > + > +static void > +put_chassis_mac_conj_id_flow(const struct sbrec_chassis_table > *chassis_table, > + const struct sbrec_chassis *chassis, > + struct ofpbuf *ofpacts_p, > + struct ovn_desired_flow_table *flow_table) > +{ > + struct match match; > + struct remote_chassis_mac *mac; > + > + populate_remote_chassis_macs(chassis, chassis_table); > + > + HMAP_FOR_EACH (mac, hmap_node, &remote_chassis_macs) { > + struct eth_addr chassis_mac; > + char *err_str = NULL; > + struct ofpact_conjunction *conj; > + > + if ((err_str = str_to_mac(mac->chassis_mac, &chassis_mac))) { > + free(err_str); > + free_remote_chassis_macs(); > + return; > + } > + > + ofpbuf_clear(ofpacts_p); > + match_init_catchall(&match); > + > + > + match_set_dl_src(&match, chassis_mac); > + > + conj = ofpact_put_CONJUNCTION(ofpacts_p); > + conj->id = CHASSIS_MAC_TO_ROUTER_MAC_CONJID; > + conj->n_clauses = 2; > + conj->clause = 0; > + ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 180, 0, > + &match, ofpacts_p, hc_uuid); > + } > + > + free_remote_chassis_macs(); > +} > + > +static void > +put_replace_chassis_mac_flows(const struct simap *ct_zones, > + const struct > + sbrec_port_binding *localnet_port, > + const struct hmap *local_datapaths, > + struct ofpbuf *ofpacts_p, > + ofp_port_t ofport, > + struct ovn_desired_flow_table *flow_table) > +{ > + /* Packets arriving on localnet port, could have been routed on > + * source chassis and hence will have a chassis mac. > + * conj_match will match source mac with chassis macs conjunction > + * and replace it with corresponding router port mac. > + */ > + struct local_datapath *ld = get_local_datapath(local_datapaths, > + > localnet_port->datapath-> > + tunnel_key); > + ovs_assert(ld); > + > + int tag = localnet_port->tag ? *localnet_port->tag : 0; > + struct zone_ids zone_ids = get_zone_ids(localnet_port, ct_zones); > + > + for (int i = 0; i < ld->n_peer_ports; i++) { > + const struct sbrec_port_binding *rport_binding = > ld->peer_ports[i]; > + struct eth_addr router_port_mac; > + char *err_str = NULL; > + struct match match; > + struct ofpact_mac *replace_mac; > + > + ovs_assert(rport_binding->n_mac == 1); > + if ((err_str = str_to_mac(rport_binding->mac[0], > &router_port_mac))) { > + /* Parsing of mac failed. */ > + VLOG_WARN("Parsing or router port mac failed for router port: > %s, " > + "with error: %s", rport_binding->logical_port, > err_str); > + free(err_str); > + return; > + } > + ofpbuf_clear(ofpacts_p); > + match_init_catchall(&match); > + > + /* Add flow, which will match on conjunction id and will > + * replace source with router port mac */ > + > + /* Match on ingress port, vlan_id and conjunction id */ > + match_set_in_port(&match, ofport); > + match_set_conj_id(&match, CHASSIS_MAC_TO_ROUTER_MAC_CONJID); > + > + if (tag) { > + match_set_dl_vlan(&match, htons(tag), 0); > + } else { > + match_set_dl_tci_masked(&match, 0, htons(VLAN_CFI)); > + } > + > + /* Actions */ > + > + if (tag) { > + ofpact_put_STRIP_VLAN(ofpacts_p); > + } > + load_logical_ingress_metadata(localnet_port, &zone_ids, > ofpacts_p); > + replace_mac = ofpact_put_SET_ETH_SRC(ofpacts_p); > + replace_mac->mac = router_port_mac; > + > + /* Resubmit to first logical ingress pipeline table. */ > + put_resubmit(OFTABLE_LOG_INGRESS_PIPELINE, ofpacts_p); > + ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, > + 180, 0, &match, ofpacts_p, hc_uuid); > + > + /* Provide second search criteria, i.e localnet port's > + * vlan ID for conjunction flow */ > + struct ofpact_conjunction *conj; > + ofpbuf_clear(ofpacts_p); > + match_init_catchall(&match); > + > + if (tag) { > + match_set_dl_vlan(&match, htons(tag), 0); > + } else { > + match_set_dl_tci_masked(&match, 0, htons(VLAN_CFI)); > + } > + > + conj = ofpact_put_CONJUNCTION(ofpacts_p); > + conj->id = CHASSIS_MAC_TO_ROUTER_MAC_CONJID; > + conj->n_clauses = 2; > + conj->clause = 1; > + ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 180, 0, &match, > + ofpacts_p, hc_uuid); > + } > +} > + > static void > put_replace_router_port_mac_flows(struct ovsdb_idl_index > *sbrec_port_binding_by_name, > @@ -934,6 +1136,11 @@ consider_port_binding(struct ovsdb_idl_index > *sbrec_port_binding_by_name, > &binding->header_.uuid); > } > > + if (!strcmp(binding->type, "localnet")) { > + put_replace_chassis_mac_flows(ct_zones, binding, > local_datapaths, > + ofpacts_p, ofport, flow_table); > + } > + > /* Table 65, Priority 100. > * ======================= > * > @@ -1227,6 +1434,7 @@ void > physical_run(struct ovsdb_idl_index *sbrec_port_binding_by_name, > const struct sbrec_multicast_group_table > *multicast_group_table, > const struct sbrec_port_binding_table *port_binding_table, > + const struct sbrec_chassis_table *chassis_table, > enum mf_field_id mff_ovn_geneve, > const struct ovsrec_bridge *br_int, > const struct sbrec_chassis *chassis, > @@ -1372,6 +1580,8 @@ physical_run(struct ovsdb_idl_index > *sbrec_port_binding_by_name, > struct ofpbuf ofpacts; > ofpbuf_init(&ofpacts, 0); > > + put_chassis_mac_conj_id_flow(chassis_table, chassis, &ofpacts, > flow_table); > + > /* Set up flows in table 0 for physical-to-logical translation and in > table > * 64 for logical-to-physical translation. */ > const struct sbrec_port_binding *binding; > diff --git a/controller/physical.h b/controller/physical.h > index a85e297..c93f6b1 100644 > --- a/controller/physical.h > +++ b/controller/physical.h > @@ -46,6 +46,7 @@ void physical_register_ovs_idl(struct ovsdb_idl *); > void physical_run(struct ovsdb_idl_index *sbrec_port_binding_by_name, > const struct sbrec_multicast_group_table *, > const struct sbrec_port_binding_table *, > + const struct sbrec_chassis_table *chassis_table, > enum mf_field_id mff_ovn_geneve, > const struct ovsrec_bridge *br_int, > const struct sbrec_chassis *chassis, > diff --git a/ovn-architecture.7.xml b/ovn-architecture.7.xml > index 415da59..6115e84 100644 > --- a/ovn-architecture.7.xml > +++ b/ovn-architecture.7.xml > @@ -1426,7 +1426,7 @@ > (MAC,VLAN) tuple will seen by physical network from other chassis > as > well, which could cause these issues: > </p> > - > + > <ul> > <li> > Continuous MAC moves in top-of-rack switch (ToR). > @@ -1442,9 +1442,11 @@ > > <li> > The destination chassis receives the packet via the localnet port > and > - sends it to the integration bridge. The packet enters the > - ingress pipeline and then egress pipeline of the destination > localnet > - logical switch and finally gets delivered to the destination VM > port. > + sends it to the integration bridge. Before entering the integration > + bridge the source mac of the packet will be replaced with > + router port mac again. The packet enters the ingress pipeline and > + then egress pipeline of the destination localnet logical switch and > + finally gets delivered to the destination VM port. > </li> > </ol> > > diff --git a/tests/ovn.at b/tests/ovn.at > index de1b3b3..2a35b4e 100644 > --- a/tests/ovn.at > +++ b/tests/ovn.at > @@ -14278,6 +14278,14 @@ hv_to_chassis_mac () { > esac > } > > +lrp_to_lrp_mac () { > + case $1 in dnl ( > + router-to-ls[[1]]) echo 000001010203 ;; dnl ( > + router-to-ls[[2]]) echo 000001010205 ;; dnl ( > + *) AT_FAIL_IF([:]) ;; > + esac > +} > + > ip_to_hex() { > printf "%02x%02x%02x%02x" "$@" > } > @@ -14345,7 +14353,9 @@ test_ip() { > # (and checksum). > outport_num=`vif_to_num $outport` > out_lrp=`vif_to_lrp $outport` > - echo > f000000000${outport_num}aabbccddee${hv_num}${hv_num}08004500001c00000000"3f1101"00${src_ip}${dst_ip}0035111100080000 > + lrp_mac=`lrp_to_lrp_mac $out_lrp` > + echo > f000000000${outport_num}${lrp_mac}08004500001c00000000"3f1101"00${src_ip}${dst_ip}0035111100080000 > + > fi >> $outport.expected > done > } > @@ -15720,7 +15730,7 @@ test_ip() { > out_lrp=`vif_to_lrp $outport` > # For North-South, packet will come via gateway chassis, > i.e hv3 > if test $inport = vif-north; then > - echo > f00000000011aabbccddee3308004500001c000000003f110100${src_ip}${dst_ip}0035111100080000 > >> $outport.expected > + echo > f0000000001100000101020308004500001c000000003f110100${src_ip}${dst_ip}0035111100080000 > >> $outport.expected > fi > if test $outport = vif-north; then > echo > f0f00000001100000101020708004500001c000000003e110200${src_ip}${dst_ip}0035111100080000 > >> $outport.expected > -- > 1.8.3.1 > > _______________________________________________ > dev mailing list > d...@openvswitch.org > https://mail.openvswitch.org/mailman/listinfo/ovs-dev > _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev