The branch main has been updated by ks:

URL: 
https://cgit.FreeBSD.org/src/commit/?id=65c318630123fcf2b6f491bf4d02a5cad3031d20

commit 65c318630123fcf2b6f491bf4d02a5cad3031d20
Author:     Kajetan Staszkiewicz <k...@freebsd.org>
AuthorDate: 2025-08-01 19:01:37 +0000
Commit:     Kajetan Staszkiewicz <k...@freebsd.org>
CommitDate: 2025-08-29 07:58:40 +0000

    pf: Add prefer-ipv6-nexthop option for route-to pools
    
    Now that pf is aware of the address family of each pool address and
    source tracking uses distinct address families for source and redirection
    addresses, it is possible to add a new pool option prefer-ipv6-nexthop
    which enables routing of IPv4 packets over IPv6 next hops for rules
    with the route-to option.
    
    Add a pool option flag PF_POOL_IPV6NH, apply it to pools with a keyword
    prefer-ipv6-nexthop.
    
    Modify pf_map_addr() to handle pools with addresses of different
    families. Use *naf as a hint about what address family the forwarded
    packet is, then pick from the pool addresses of family that can be used
    as a next hop for the forwarded packet, controlled by the PF_POOL_IPV6NH
    flag. For NAT pools this flag is never set and thus pf_map_addr()
    will return an IP address of the same family as the forwarded packet.
    For route-to pools, when the flag is enabled, IPv6 addresses can be
    returned for IPv4 packets.
    
    In pf_route() check rt_af, it is not guaranteed to be AF_INET anymore
    because pf_map_addr() could have changed it (as *naf).
    
    Add tests for behaviour of pf_map_addr() both with PF_POOL_IPV6NH and
    without, for single IP addresses, prefixes and subnets.
    
    Reviewed by:    kp
    Sponsored by:   InnoGames GmbH
    Differential Revision:  https://reviews.freebsd.org/D50781
---
 sbin/pfctl/parse.y                   |  50 ++-
 sbin/pfctl/pfctl_parser.c            |   6 +-
 sbin/pfctl/tests/files/pf1073.in     |   1 +
 sbin/pfctl/tests/files/pf1073.ok     |   1 +
 sbin/pfctl/tests/files/pf1074.fail   |   1 +
 sbin/pfctl/tests/files/pf1074.in     |   1 +
 sbin/pfctl/tests/pfctl_test_list.inc |   2 +
 share/man/man5/pf.conf.5             |  18 +-
 sys/net/pfvar.h                      |   3 +-
 sys/netpfil/pf/if_pfsync.c           |   8 +-
 sys/netpfil/pf/pf.c                  |  63 ++-
 sys/netpfil/pf/pf.h                  |   1 +
 sys/netpfil/pf/pf_ioctl.c            |   1 +
 sys/netpfil/pf/pf_lb.c               | 151 ++++++--
 tests/sys/netpfil/pf/route_to.sh     | 716 ++++++++++++++++++++++++++++++++++-
 tests/sys/netpfil/pf/src_track.sh    |  36 +-
 tests/sys/netpfil/pf/utils.subr      |   4 +-
 17 files changed, 953 insertions(+), 110 deletions(-)

diff --git a/sbin/pfctl/parse.y b/sbin/pfctl/parse.y
index 00c36b218055..59c27d1f5d7c 100644
--- a/sbin/pfctl/parse.y
+++ b/sbin/pfctl/parse.y
@@ -238,6 +238,7 @@ static struct pool_opts {
 #define POM_TYPE               0x01
 #define POM_STICKYADDRESS      0x02
 #define POM_ENDPI              0x04
+#define POM_IPV6NH             0x08
        u_int8_t                 opts;
        int                      type;
        int                      staticport;
@@ -543,7 +544,7 @@ int parseport(char *, struct range *r, int);
 %token MAXSRCCONN MAXSRCCONNRATE OVERLOAD FLUSH SLOPPY PFLOW ALLOW_RELATED
 %token TAGGED TAG IFBOUND FLOATING STATEPOLICY STATEDEFAULTS ROUTE SETTOS
 %token DIVERTTO DIVERTREPLY BRIDGE_TO RECEIVEDON NE LE GE AFTO NATTO RDRTO
-%token BINATTO MAXPKTRATE MAXPKTSIZE
+%token BINATTO MAXPKTRATE MAXPKTSIZE IPV6NH
 %token <v.string>              STRING
 %token <v.number>              NUMBER
 %token <v.i>                   PORTBINARY
@@ -2648,13 +2649,16 @@ pfrule          : action dir logquick interface route 
af proto fromto
                                        YYERROR;
                                }
                                r.rt = $5.rt;
-                               decide_address_family($5.redirspec->host, 
&r.af);
-                               if (!(r.rule_flag & PFRULE_AFTO))
-                                       
remove_invalid_hosts(&($5.redirspec->host), &r.af);
-                               if ($5.redirspec->host == NULL) {
-                                       yyerror("no routing address with "
-                                           "matching address family found.");
-                                       YYERROR;
+
+                               if (!($5.redirspec->pool_opts.opts & 
PF_POOL_IPV6NH)) {
+                                       
decide_address_family($5.redirspec->host, &r.af);
+                                       if (!(r.rule_flag & PFRULE_AFTO))
+                                               
remove_invalid_hosts(&($5.redirspec->host), &r.af);
+                                       if ($5.redirspec->host == NULL) {
+                                               yyerror("no routing address 
with "
+                                                   "matching address family 
found.");
+                                               YYERROR;
+                                       }
                                }
                        }
 #ifdef __FreeBSD__
@@ -2978,7 +2982,8 @@ filter_opt        : USER uids {
 
                        filter_opts.nat = $4;
                        filter_opts.nat->af = $2;
-                       if ($4->af && $4->af != $2) {
+                       remove_invalid_hosts(&($4->host), 
&(filter_opts.nat->af));
+                       if ($4->host == NULL) {
                                yyerror("af-to addresses must be in the "
                                   "target address family");
                                YYERROR;
@@ -2998,8 +3003,9 @@ filter_opt        : USER uids {
                        filter_opts.nat->af = $2;
                        filter_opts.rdr = $6;
                        filter_opts.rdr->af = $2;
-                       if (($4->af && $4->host->af != $2) ||
-                           ($6->af && $6->host->af != $2)) {
+                       remove_invalid_hosts(&($4->host), 
&(filter_opts.nat->af));
+                       remove_invalid_hosts(&($6->host), 
&(filter_opts.rdr->af));
+                       if ($4->host == NULL || $6->host == NULL) {
                                yyerror("af-to addresses must be in the "
                                   "target address family");
                                YYERROR;
@@ -4674,6 +4680,14 @@ pool_opt : BITMASK       {
                        pool_opts.marker |= POM_ENDPI;
                        pool_opts.opts |= PF_POOL_ENDPI;
                }
+               | IPV6NH {
+                       if (pool_opts.marker & POM_IPV6NH) {
+                               yyerror("prefer-ipv6-nexthop cannot be 
redefined");
+                               YYERROR;
+                       }
+                       pool_opts.marker |= POM_IPV6NH;
+                       pool_opts.opts |= PF_POOL_IPV6NH;
+               }
                | MAPEPORTSET number '/' number '/' number {
                        if (pool_opts.mape.offset) {
                                yyerror("map-e-portset cannot be redefined");
@@ -4813,6 +4827,12 @@ natrule          : nataction interface af proto fromto 
tag tagged rtable
                                            "address'");
                                        YYERROR;
                                }
+                               if ($9->pool_opts.opts & PF_POOL_IPV6NH) {
+                                       yyerror("The prefer-ipv6-nexthop option 
"
+                                           "can't be used for nat/rdr/binat 
pools"
+                                       );
+                                       YYERROR;
+                               }
                                if (!r.af && ! $9->host->ifindex)
                                        r.af = $9->host->af;
 
@@ -5074,13 +5094,6 @@ route_host       : STRING                        {
 
 route_host_list        : route_host optnl                      { $$ = $1; }
                | route_host_list comma route_host optnl {
-                       if ($1->af == 0)
-                               $1->af = $3->af;
-                       if ($1->af != $3->af) {
-                               yyerror("all pool addresses must be in the "
-                                   "same address family");
-                               YYERROR;
-                       }
                        $1->tail->next = $3;
                        $1->tail = $3->tail;
                        $$ = $1;
@@ -6678,6 +6691,7 @@ lookup(char *s)
                { "pass",               PASS},
                { "pflow",              PFLOW},
                { "port",               PORT},
+               { "prefer-ipv6-nexthop", IPV6NH},
                { "prio",               PRIO},
                { "priority",           PRIORITY},
                { "priq",               PRIQ},
diff --git a/sbin/pfctl/pfctl_parser.c b/sbin/pfctl/pfctl_parser.c
index 18b78a150c28..3c4f9f6b4334 100644
--- a/sbin/pfctl/pfctl_parser.c
+++ b/sbin/pfctl/pfctl_parser.c
@@ -508,6 +508,8 @@ print_pool(struct pfctl_pool *pool, u_int16_t p1, u_int16_t 
p2, int id)
        if (pool->mape.offset > 0)
                printf(" map-e-portset %u/%u/%u",
                    pool->mape.offset, pool->mape.psidlen, pool->mape.psid);
+       if (pool->opts & PF_POOL_IPV6NH)
+               printf(" prefer-ipv6-nexthop");
 }
 
 void
@@ -1438,7 +1440,7 @@ ifa_add_groups_to_map(char *ifa_name)
                        ENTRY                    item;
                        ENTRY                   *ret_item;
                        int                     *answer;
-       
+
                        item.key = ifg->ifgrq_group;
                        if (hsearch_r(item, FIND, &ret_item, &isgroup_map) == 
0) {
                                struct ifgroupreq        ifgr2;
@@ -1580,7 +1582,7 @@ is_a_group(char *name)
 {
        ENTRY                    item;
        ENTRY                   *ret_item;
-       
+
        item.key = name;
        if (hsearch_r(item, FIND, &ret_item, &isgroup_map) == 0)
                return (0);
diff --git a/sbin/pfctl/tests/files/pf1073.in b/sbin/pfctl/tests/files/pf1073.in
new file mode 100644
index 000000000000..477995893ac3
--- /dev/null
+++ b/sbin/pfctl/tests/files/pf1073.in
@@ -0,0 +1 @@
+pass in on vtnet0 route-to ( vtnet1 2001:db8::1 ) prefer-ipv6-nexthop inet
diff --git a/sbin/pfctl/tests/files/pf1073.ok b/sbin/pfctl/tests/files/pf1073.ok
new file mode 100644
index 000000000000..f34867508c75
--- /dev/null
+++ b/sbin/pfctl/tests/files/pf1073.ok
@@ -0,0 +1 @@
+pass in on vtnet0 route-to (vtnet1 2001:db8::1) prefer-ipv6-nexthop inet all 
flags S/SA keep state
diff --git a/sbin/pfctl/tests/files/pf1074.fail 
b/sbin/pfctl/tests/files/pf1074.fail
new file mode 100644
index 000000000000..afe8ee3c458f
--- /dev/null
+++ b/sbin/pfctl/tests/files/pf1074.fail
@@ -0,0 +1 @@
+no routing address with matching address family found.
diff --git a/sbin/pfctl/tests/files/pf1074.in b/sbin/pfctl/tests/files/pf1074.in
new file mode 100644
index 000000000000..5d285bc5d6e8
--- /dev/null
+++ b/sbin/pfctl/tests/files/pf1074.in
@@ -0,0 +1 @@
+pass in on vtnet0 route-to ( vtnet1 2001:db8::1 ) inet
diff --git a/sbin/pfctl/tests/pfctl_test_list.inc 
b/sbin/pfctl/tests/pfctl_test_list.inc
index 3a68cc06ec74..8bfccddf50e5 100644
--- a/sbin/pfctl/tests/pfctl_test_list.inc
+++ b/sbin/pfctl/tests/pfctl_test_list.inc
@@ -181,3 +181,5 @@ PFCTL_TEST(1069, "max-pkt-size")
 PFCTL_TEST_FAIL(1070, "include line number")
 PFCTL_TEST(1071, "mask length on (lo0)")
 PFCTL_TEST_FAIL(1072, "Invalid port range")
+PFCTL_TEST(1073, "Filter AF different than route-to AF, with 
prefer-ipv6-nexthop")
+PFCTL_TEST_FAIL(1074, "Filter AF different than route-to AF, without 
prefer-ipv6-nexthop")
diff --git a/share/man/man5/pf.conf.5 b/share/man/man5/pf.conf.5
index a9ae823257a4..bdd8a843d72a 100644
--- a/share/man/man5/pf.conf.5
+++ b/share/man/man5/pf.conf.5
@@ -2470,7 +2470,13 @@ NAT address and port.
 This feature implements "full-cone" NAT behavior.
 .El
 .Pp
-Additionally, the
+Additionally, options
+.Ar sticky-address
+and
+.Ar prefer-ipv6-nexthop
+can be specified to influence how IP addresses are selected from pools.
+.Pp
+The
 .Ar sticky-address
 option can be specified to help ensure that multiple connections from the
 same source are mapped to the same redirection address.
@@ -2486,6 +2492,14 @@ beyond the lifetime of the states, increase the global 
options with
 See
 .Sx STATEFUL TRACKING OPTIONS
 for more ways to control the source tracking.
+.Pp
+The
+.Ar prefer-ipv6-nexthop
+option allows for IPv6 addresses to be used as the nexthop
+for IPv4 packets routed with the
+.Ar route-to
+rule option. If a table is used with IPv4 and IPv6 addresses, first the IPv6 
addresses
+will be used in round-robin fashion, then IPv4 addresses.
 .Sh STATE MODULATION
 Much of the security derived from TCP is attributable to how well the
 initial sequence numbers (ISNs) are chosen.
@@ -3580,7 +3594,7 @@ limit-item     = ( "states" | "frags" | "src-nodes" ) 
number
 
 pooltype       = ( "bitmask" | "random" |
                  "source-hash" [ ( hex-key | string-key ) ] |
-                 "round-robin" ) [ sticky-address ]
+                 "round-robin" ) [ sticky-address | prefer-ipv6-nexthop ]
 
 subqueue       = string | "{" queue-list "}"
 queue-list     = string [ [ "," ] string ]
diff --git a/sys/net/pfvar.h b/sys/net/pfvar.h
index d6c13470f2eb..cf6d2508cf65 100644
--- a/sys/net/pfvar.h
+++ b/sys/net/pfvar.h
@@ -645,6 +645,7 @@ struct pf_kpool {
        int                      tblidx;
        u_int16_t                proxy_port[2];
        u_int8_t                 opts;
+       sa_family_t              ipv6_nexthop_af;
 };
 
 struct pf_rule_actions {
@@ -2680,7 +2681,7 @@ u_short                    pf_map_addr(sa_family_t, 
struct pf_krule *,
                            struct pf_addr *, struct pf_kpool *);
 u_short                         pf_map_addr_sn(u_int8_t, struct pf_krule *,
                            struct pf_addr *, struct pf_addr *,
-                           sa_family_t *, struct pfi_kkif **nkif,
+                           sa_family_t *, struct pfi_kkif **,
                            struct pf_addr *, struct pf_kpool *,
                            pf_sn_types_t);
 int                     pf_get_transaddr_af(struct pf_krule *,
diff --git a/sys/netpfil/pf/if_pfsync.c b/sys/netpfil/pf/if_pfsync.c
index 585c196391c0..cfc300d99396 100644
--- a/sys/netpfil/pf/if_pfsync.c
+++ b/sys/netpfil/pf/if_pfsync.c
@@ -605,7 +605,8 @@ pfsync_state_import(union pfsync_state_union *sp, int 
flags, int msg_version)
                        rt_kif = rpool_first->kif;
                        /*
                         * Guess the AF of the route address, FreeBSD 13 does
-                        * not support af-to so it should be safe.
+                        * not support af-to nor prefer-ipv6-nexthop
+                        * so it should be safe.
                         */
                        rt_af = r->af;
                } else if (!PF_AZERO(&sp->pfs_1301.rt_addr, sp->pfs_1301.af)) {
@@ -634,8 +635,9 @@ pfsync_state_import(union pfsync_state_union *sp, int 
flags, int msg_version)
                        }
                        rt = sp->pfs_1400.rt;
                        /*
-                        * Guess the AF of the route address, FreeBSD 13 does
-                        * not support af-to so it should be safe.
+                        * Guess the AF of the route address, FreeBSD 14 does
+                        * not support af-to nor prefer-ipv6-nexthop
+                        * so it should be safe.
                         */
                        rt_af = sp->pfs_1400.af;
                }
diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c
index 8cd4fff95b15..4325835c7671 100644
--- a/sys/netpfil/pf/pf.c
+++ b/sys/netpfil/pf/pf.c
@@ -5960,7 +5960,9 @@ pf_test_rule(struct pf_krule **rm, struct pf_kstate **sm,
        if (r->rt) {
                /*
                 * Set act.rt here instead of in pf_rule_to_actions() because
-                * it is applied only from the last pass rule.
+                * it is applied only from the last pass rule. For rules
+                * with the prefer-ipv6-nexthop option act.rt_af is a hint
+                * about AF of the forwarded packet and might be changed.
                 */
                pd->act.rt = r->rt;
                if (r->rt == PF_REPLYTO)
@@ -8974,9 +8976,10 @@ pf_route(struct pf_krule *r, struct ifnet *oifp,
     struct pf_kstate *s, struct pf_pdesc *pd, struct inpcb *inp)
 {
        struct mbuf             *m0, *m1, *md;
-       struct route             ro;
-       const struct sockaddr   *gw = &ro.ro_dst;
-       struct sockaddr_in      *dst;
+       struct route_in6         ro;
+       union sockaddr_union     rt_gw;
+       const union sockaddr_union      *gw = (const union sockaddr_union 
*)&ro.ro_dst;
+       union sockaddr_union    *dst;
        struct ip               *ip;
        struct ifnet            *ifp = NULL;
        int                      error = 0;
@@ -9071,10 +9074,35 @@ pf_route(struct pf_krule *r, struct ifnet *oifp,
        ip = mtod(m0, struct ip *);
 
        bzero(&ro, sizeof(ro));
-       dst = (struct sockaddr_in *)&ro.ro_dst;
-       dst->sin_family = AF_INET;
-       dst->sin_len = sizeof(struct sockaddr_in);
-       dst->sin_addr.s_addr = pd->act.rt_addr.v4.s_addr;
+       dst = (union sockaddr_union *)&ro.ro_dst;
+       dst->sin.sin_family = AF_INET;
+       dst->sin.sin_len = sizeof(struct sockaddr_in);
+       dst->sin.sin_addr = ip->ip_dst;
+       if (ifp) { /* Only needed in forward direction and route-to */
+               bzero(&rt_gw, sizeof(rt_gw));
+               ro.ro_flags |= RT_HAS_GW;
+               gw = &rt_gw;
+               switch (pd->act.rt_af) {
+#ifdef INET
+               case AF_INET:
+                       rt_gw.sin.sin_family = AF_INET;
+                       rt_gw.sin.sin_len = sizeof(struct sockaddr_in);
+                       rt_gw.sin.sin_addr.s_addr = pd->act.rt_addr.v4.s_addr;
+                       break;
+#endif /* INET */
+#ifdef INET6
+               case AF_INET6:
+                       rt_gw.sin6.sin6_family = AF_INET6;
+                       rt_gw.sin6.sin6_len = sizeof(struct sockaddr_in6);
+                       pf_addrcpy((struct pf_addr *)&rt_gw.sin6.sin6_addr,
+                           &pd->act.rt_addr, AF_INET6);
+                       break;
+#endif /* INET6 */
+               default:
+                       /* Normal af-to without route-to */
+                       break;
+               }
+       }
 
        if (pd->dir == PF_IN) {
                if (ip->ip_ttl <= IPTTLDEC) {
@@ -9098,10 +9126,10 @@ pf_route(struct pf_krule *r, struct ifnet *oifp,
 
                                /* Use the gateway if needed. */
                                if (nh->nh_flags & NHF_GATEWAY) {
-                                       gw = &nh->gw_sa;
+                                       gw = (const union sockaddr_union 
*)&nh->gw_sa;
                                        ro.ro_flags |= RT_HAS_GW;
                                } else {
-                                       dst->sin_addr = ip->ip_dst;
+                                       dst->sin.sin_addr = ip->ip_dst;
                                }
 
                                /*
@@ -9126,6 +9154,9 @@ pf_route(struct pf_krule *r, struct ifnet *oifp,
                PF_STATE_UNLOCK(s);
        }
 
+       /* It must have been either set from rt_af or from fib4_lookup */
+       KASSERT(gw->sin.sin_family != 0, ("%s: gw address family undetermined", 
__func__));
+
        if (ifp == NULL) {
                m0 = pd->m;
                pd->m = NULL;
@@ -9210,9 +9241,11 @@ pf_route(struct pf_krule *r, struct ifnet *oifp,
                m_clrprotoflags(m0);    /* Avoid confusing lower layers. */
 
                md = m0;
-               error = pf_dummynet_route(pd, s, r, ifp, gw, &md);
+               error = pf_dummynet_route(pd, s, r, ifp,
+                   (const struct sockaddr *)gw, &md);
                if (md != NULL) {
-                       error = (*ifp->if_output)(ifp, md, gw, &ro);
+                       error = (*ifp->if_output)(ifp, md,
+                           (const struct sockaddr *)gw, (struct route *)&ro);
                        SDT_PROBE2(pf, ip, route_to, output, ifp, error);
                }
                goto done;
@@ -9253,9 +9286,11 @@ pf_route(struct pf_krule *r, struct ifnet *oifp,
                        md = m0;
                        pd->pf_mtag = pf_find_mtag(md);
                        error = pf_dummynet_route(pd, s, r, ifp,
-                           gw, &md);
+                           (const struct sockaddr *)gw, &md);
                        if (md != NULL) {
-                               error = (*ifp->if_output)(ifp, md, gw, &ro);
+                               error = (*ifp->if_output)(ifp, md,
+                                   (const struct sockaddr *)gw,
+                                   (struct route *)&ro);
                                SDT_PROBE2(pf, ip, route_to, output, ifp, 
error);
                        }
                } else
diff --git a/sys/netpfil/pf/pf.h b/sys/netpfil/pf/pf.h
index 51b3fd6390e1..8edd5a5110a1 100644
--- a/sys/netpfil/pf/pf.h
+++ b/sys/netpfil/pf/pf.h
@@ -131,6 +131,7 @@ enum        { PF_ADDR_ADDRMASK, PF_ADDR_NOROUTE, 
PF_ADDR_DYNIFTL,
 #define PF_POOL_TYPEMASK       0x0f
 #define PF_POOL_STICKYADDR     0x20
 #define PF_POOL_ENDPI          0x40
+#define PF_POOL_IPV6NH         0x80
 #define        PF_WSCALE_FLAG          0x80
 #define        PF_WSCALE_MASK          0x0f
 
diff --git a/sys/netpfil/pf/pf_ioctl.c b/sys/netpfil/pf/pf_ioctl.c
index e5da05a958f6..d395730d6a54 100644
--- a/sys/netpfil/pf/pf_ioctl.c
+++ b/sys/netpfil/pf/pf_ioctl.c
@@ -2276,6 +2276,7 @@ pf_ioctl_addrule(struct pf_krule *rule, uint32_t ticket,
        rule->nat.cur = TAILQ_FIRST(&rule->nat.list);
        rule->rdr.cur = TAILQ_FIRST(&rule->rdr.list);
        rule->route.cur = TAILQ_FIRST(&rule->route.list);
+       rule->route.ipv6_nexthop_af = AF_INET6;
        TAILQ_INSERT_TAIL(ruleset->rules[rs_num].inactive.ptr,
            rule, entries);
        ruleset->rules[rs_num].inactive.rcount++;
diff --git a/sys/netpfil/pf/pf_lb.c b/sys/netpfil/pf/pf_lb.c
index bc9e1dc72902..b8b5157c9b15 100644
--- a/sys/netpfil/pf/pf_lb.c
+++ b/sys/netpfil/pf/pf_lb.c
@@ -545,11 +545,18 @@ pf_map_addr(sa_family_t saf, struct pf_krule *r, struct 
pf_addr *saddr,
        uint64_t                 hashidx;
        int                      cnt;
        sa_family_t              wanted_af;
+       u_int8_t                 pool_type;
+       bool                     prefer_ipv6_nexthop = rpool->opts & 
PF_POOL_IPV6NH;
 
        KASSERT(saf != 0, ("%s: saf == 0", __func__));
        KASSERT(naf != NULL, ("%s: naf = NULL", __func__));
        KASSERT((*naf) != 0, ("%s: *naf = 0", __func__));
 
+       /*
+        * Given (*naf) is a hint about AF of the forwarded packet.
+        * It might be changed if prefer_ipv6_nexthop is enabled and
+        * the combination of nexthop AF and packet AF allows for it.
+        */
        wanted_af = (*naf);
 
        mtx_lock(&rpool->mtx);
@@ -594,19 +601,38 @@ pf_map_addr(sa_family_t saf, struct pf_krule *r, struct 
pf_addr *saddr,
        } else {
                raddr = &rpool->cur->addr.v.a.addr;
                rmask = &rpool->cur->addr.v.a.mask;
-               /*
-                * For single addresses check their address family. Unless they
-                * have none, which happens when addresses are added with
-                * the old ioctl mechanism. In such case trust that the address
-                * has the proper AF.
-                */
-               if (rpool->cur->af && rpool->cur->af != wanted_af) {
-                       reason = PFRES_MAPFAILED;
-                       goto done_pool_mtx;
+       }
+
+       /*
+        * For pools with a single host with the prefer-ipv6-nexthop option
+        * we can return pool address of any AF, unless the forwarded packet
+        * is IPv6, then we can return only if pool address is IPv6.
+        * For non-prefer-ipv6-nexthop we can return pool address only
+        * of wanted AF, unless the pool address'es AF is unknown, which
+        * happens in case old ioctls have been used to set up the pool.
+        *
+        * Round-robin pools have their own logic for retrying next addresses.
+        */
+       pool_type = rpool->opts & PF_POOL_TYPEMASK;
+       if (pool_type == PF_POOL_NONE || pool_type == PF_POOL_BITMASK ||
+           ((pool_type == PF_POOL_RANDOM || pool_type == PF_POOL_SRCHASH) &&
+           rpool->cur->addr.type != PF_ADDR_TABLE &&
+           rpool->cur->addr.type != PF_ADDR_DYNIFTL)) {
+               if (prefer_ipv6_nexthop) {
+                       if (rpool->cur->af == AF_INET && (*naf) == AF_INET6) {
+                               reason = PFRES_MAPFAILED;
+                               goto done_pool_mtx;
+                       }
+                       wanted_af = rpool->cur->af;
+               } else {
+                       if (rpool->cur->af != 0 && rpool->cur->af != (*naf)) {
+                               reason = PFRES_MAPFAILED;
+                               goto done_pool_mtx;
+                       }
                }
        }
 
-       switch (rpool->opts & PF_POOL_TYPEMASK) {
+       switch (pool_type) {
        case PF_POOL_NONE:
                pf_addrcpy(naddr, raddr, wanted_af);
                break;
@@ -631,10 +657,22 @@ pf_map_addr(sa_family_t saf, struct pf_krule *r, struct 
pf_addr *saddr,
                        else
                                rpool->tblidx = (int)arc4random_uniform(cnt);
                        memset(&rpool->counter, 0, sizeof(rpool->counter));
+                       if (prefer_ipv6_nexthop)
+                               wanted_af = AF_INET6;
+               retry_other_af_random:
                        if (pfr_pool_get(kt, &rpool->tblidx, &rpool->counter,
                            wanted_af, pf_islinklocal, false)) {
-                               reason = PFRES_MAPFAILED;
-                               goto done_pool_mtx; /* unsupported */
+                               /* Retry with IPv4 nexthop for IPv4 traffic */
+                               if (prefer_ipv6_nexthop &&
+                                   wanted_af == AF_INET6 &&
+                                   (*naf) == AF_INET) {
+                                       wanted_af = AF_INET;
+                                       goto retry_other_af_random;
+                               } else {
+                                        /* no hosts in wanted AF */
+                                       reason = PFRES_MAPFAILED;
+                                       goto done_pool_mtx;
+                               }
                        }
                        pf_addrcpy(naddr, &rpool->counter, wanted_af);
                } else if (init_addr != NULL && PF_AZERO(init_addr,
@@ -702,10 +740,22 @@ pf_map_addr(sa_family_t saf, struct pf_krule *r, struct 
pf_addr *saddr,
                        else
                                rpool->tblidx = (int)(hashidx % cnt);
                        memset(&rpool->counter, 0, sizeof(rpool->counter));
+                       if (prefer_ipv6_nexthop)
+                               wanted_af = AF_INET6;
+               retry_other_af_srchash:
                        if (pfr_pool_get(kt, &rpool->tblidx, &rpool->counter,
                            wanted_af, pf_islinklocal, false)) {
-                               reason = PFRES_MAPFAILED;
-                               goto done_pool_mtx; /* unsupported */
+                               /* Retry with IPv4 nexthop for IPv4 traffic */
+                               if (prefer_ipv6_nexthop &&
+                                   wanted_af == AF_INET6 &&
+                                   (*naf) == AF_INET) {
+                                       wanted_af = AF_INET;
+                                       goto retry_other_af_srchash;
+                               } else {
+                                        /* no hosts in wanted AF */
+                                       reason = PFRES_MAPFAILED;
+                                       goto done_pool_mtx;
+                               }
                        }
                        pf_addrcpy(naddr, &rpool->counter, wanted_af);
                } else {
@@ -718,6 +768,9 @@ pf_map_addr(sa_family_t saf, struct pf_krule *r, struct 
pf_addr *saddr,
            {
                struct pf_kpooladdr *acur = rpool->cur;
 
+       retry_other_af_rr:
+               if (prefer_ipv6_nexthop)
+                       wanted_af = rpool->ipv6_nexthop_af;
                if (rpool->cur->addr.type == PF_ADDR_TABLE) {
                        if (!pfr_pool_get(rpool->cur->addr.p.tbl,
                            &rpool->tblidx, &rpool->counter, wanted_af,
@@ -728,46 +781,55 @@ pf_map_addr(sa_family_t saf, struct pf_krule *r, struct 
pf_addr *saddr,
                            &rpool->tblidx, &rpool->counter, wanted_af,
                            pf_islinklocal, true))
                                goto get_addr;
-               } else if (pf_match_addr(0, raddr, rmask, &rpool->counter,
-                   wanted_af))
+               } else if (rpool->cur->af == wanted_af &&
+                   pf_match_addr(0, raddr, rmask, &rpool->counter, wanted_af))
                        goto get_addr;
-
+               if (prefer_ipv6_nexthop &&
+                   (*naf) == AF_INET && wanted_af == AF_INET6) {
+                       /* Reset table index when changing wanted AF. */
+                       rpool->tblidx = -1;
+                       rpool->ipv6_nexthop_af = AF_INET;
+                       goto retry_other_af_rr;
+               }
        try_next:
+               /* Reset prefer-ipv6-nexthop search to IPv6 when iterating 
pools. */
+               rpool->ipv6_nexthop_af = AF_INET6;
                if (TAILQ_NEXT(rpool->cur, entries) == NULL)
                        rpool->cur = TAILQ_FIRST(&rpool->list);
                else
                        rpool->cur = TAILQ_NEXT(rpool->cur, entries);
+       try_next_ipv6_nexthop_rr:
+               /* Reset table index when iterating pools or changing wanted 
AF. */
                rpool->tblidx = -1;
+               if (prefer_ipv6_nexthop)
+                       wanted_af = rpool->ipv6_nexthop_af;
                if (rpool->cur->addr.type == PF_ADDR_TABLE) {
-                       if (pfr_pool_get(rpool->cur->addr.p.tbl,
+                       if (!pfr_pool_get(rpool->cur->addr.p.tbl,
                            &rpool->tblidx, &rpool->counter, wanted_af, NULL,
-                           true)) {
-                               /* table contains no address of type 
'wanted_af' */
-                               if (rpool->cur != acur)
-                                       goto try_next;
-                               reason = PFRES_MAPFAILED;
-                               goto done_pool_mtx;
-                       }
+                           true))
+                               goto get_addr;
                } else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
-                       if (pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt,
-                           &rpool->tblidx, &rpool->counter, wanted_af,
-                           pf_islinklocal, true)) {
-                               /* interface has no address of type 'wanted_af' 
*/
-                               if (rpool->cur != acur)
-                                       goto try_next;
-                               reason = PFRES_MAPFAILED;
-                               goto done_pool_mtx;
-                       }
+                       if (!pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt,
+                           &rpool->tblidx, &rpool->counter, wanted_af, 
pf_islinklocal,
+                           true))
+                               goto get_addr;
                } else {
-                       raddr = &rpool->cur->addr.v.a.addr;
-                       rmask = &rpool->cur->addr.v.a.mask;
-                       if (rpool->cur->af && rpool->cur->af != wanted_af) {
-                               reason = PFRES_MAPFAILED;
-                               goto done_pool_mtx;
+                       if (rpool->cur->af == wanted_af) {
+                               raddr = &rpool->cur->addr.v.a.addr;
+                               rmask = &rpool->cur->addr.v.a.mask;
+                               pf_addrcpy(&rpool->counter, raddr, wanted_af);
+                               goto get_addr;
                        }
-                       pf_addrcpy(&rpool->counter, raddr, wanted_af);
                }
-
+               if (prefer_ipv6_nexthop &&
+                   (*naf) == AF_INET && wanted_af == AF_INET6) {
+                       rpool->ipv6_nexthop_af = AF_INET;
+                       goto try_next_ipv6_nexthop_rr;
+               }
+               if (rpool->cur != acur)
+                       goto try_next;
+               reason = PFRES_MAPFAILED;
+               goto done_pool_mtx;
        get_addr:
                pf_addrcpy(naddr, &rpool->counter, wanted_af);
                if (init_addr != NULL && PF_AZERO(init_addr, wanted_af))
@@ -777,9 +839,16 @@ pf_map_addr(sa_family_t saf, struct pf_krule *r, struct 
pf_addr *saddr,
            }
        }
 
+       if (wanted_af == 0) {
+               reason = PFRES_MAPFAILED;
+               goto done_pool_mtx;
+       }
+
        if (nkif)
                *nkif = rpool->cur->kif;
 
+       (*naf) = wanted_af;
+
 done_pool_mtx:
        mtx_unlock(&rpool->mtx);
 
diff --git a/tests/sys/netpfil/pf/route_to.sh b/tests/sys/netpfil/pf/route_to.sh
index 765403dcb79c..872de0dcbb91 100644
--- a/tests/sys/netpfil/pf/route_to.sh
+++ b/tests/sys/netpfil/pf/route_to.sh
@@ -28,6 +28,75 @@
 
 common_dir=$(atf_get_srcdir)/../common
 
+# We need to somehow test if the random algorithm of pf_map_addr() is working.
+# The table or prefix contains multiple IP next-hop addresses, for each one try
+# to establish up to 10 connections. Fail the test if with this many attempts
+# the "good" target has not been chosen. However, since this choice is random,
+# the check might still occasionally fail.
+check_random() {
+       if [ "$1" = "IPv4" ]; then
+               ping_from="${net_clients_4}.1"
+               ping_to="${host_server_4}"
+       else
+               ping_from="${net_clients_6}::1"
+               ping_to="${host_server_6}"
+       fi
+       good_targets="$2"
+       bad_targets="$3"
+
+       port=42000
+       states=$(mktemp) || exit 1
+       for good_target in $good_targets; do
+               found="no"
+               for attempt in $(seq 1 10); do
+                       port=$(( port + 1 ))
+                       jexec router pfctl -Fs
+                       atf_check -s exit:0 ${common_dir}/pft_ping.py \
+                               --sendif ${epair_tester}a --replyif 
${epair_tester}a \
+                               --fromaddr ${ping_from} --to ${ping_to} \
+                               --ping-type=tcp3way --send-sport=${port}
+                       jexec router pfctl -qvvss | normalize_pfctl_s > $states
+                       cat $states
+                       if [ -n "${bad_targets}" ]; then
+                               for bad_target in $bad_targets; do
+                                       if grep -qE "route-to: ${bad_target}@" 
$states; then
+                                               atf_fail "Bad target 
${bad_target} selected!"
+                                       fi
+                               done
+                       fi;
+                       if grep -qE "route-to: ${good_target}@" $states; then
+                               found=yes
+                               break
+                       fi
+               done
+               if [ "${found}" = "no" ]; then
+                       atf_fail "Target ${good_target} not selected after 
${attempt} attempts!"
+               fi
+       done
+}
+
+pf_map_addr_common()
+{
+       setup_router_server_nat64
+
+       # Clients will connect from another network behind the router.
+       # This allows for using multiple source addresses.
+       jexec router route add -6 ${net_clients_6}::/${net_clients_6_mask} 
${net_tester_6_host_tester}
+       jexec router route add    ${net_clients_4}.0/${net_clients_4_mask} 
${net_tester_4_host_tester}
+
+       # The servers are reachable over additional IP addresses for
+       # testing of tables and subnets. The addresses are noncontiguous
+       # for pf_map_addr() counter tests.
+       for i in 0 1 4 5; do
+               a1=$((24 + i))
+               jexec server1 ifconfig ${epair_server1}b inet  
${net_server1_4}.${a1}/32 alias
+               jexec server1 ifconfig ${epair_server1}b inet6 
${net_server1_6}::42:${i}/128 alias
+               a2=$((40 + i))
+               jexec server2 ifconfig ${epair_server2}b inet  
${net_server2_4}.${a2}/32 alias
+               jexec server2 ifconfig ${epair_server2}b inet6 
${net_server2_6}::42:${i}/128 alias
+       done
+}
+
 atf_test_case "v4" "cleanup"
 v4_head()
 {
@@ -893,36 +962,17 @@ empty_pool_cleanup()
        pft_cleanup
 }
 
-
 atf_test_case "table_loop" "cleanup"
 
 table_loop_head()
 {
        atf_set descr 'Check that iterating over tables poperly loops'
        atf_set require.user root
-       atf_set require.progs python3 scapy
 }
 
 table_loop_body()
 {
-       setup_router_server_nat64
-
-       # Clients will connect from another network behind the router.
-       # This allows for using multiple source addresses.
-       jexec router route add -6 ${net_clients_6}::/${net_clients_6_mask} 
${net_tester_6_host_tester}
-       jexec router route add    ${net_clients_4}.0/${net_clients_4_mask} 
${net_tester_4_host_tester}
-
-       # The servers are reachable over additional IP addresses for
-       # testing of tables and subnets. The addresses are noncontinougnus
-       # for pf_map_addr() counter tests.
-       for i in 0 1 4 5; do
-               a1=$((24 + i))
-               jexec server1 ifconfig ${epair_server1}b inet  
${net_server1_4}.${a1}/32 alias
-               jexec server1 ifconfig ${epair_server1}b inet6 
${net_server1_6}::42:${i}/128 alias
-               a2=$((40 + i))
-               jexec server2 ifconfig ${epair_server2}b inet  
${net_server2_4}.${a2}/32 alias
-               jexec server2 ifconfig ${epair_server2}b inet6 
${net_server2_6}::42:${i}/128 alias
-       done
+       pf_map_addr_common
 
        jexec router pfctl -e
        pft_set_rules router \
@@ -976,6 +1026,612 @@ table_loop_cleanup()
 }
 
 
+atf_test_case "roundrobin" "cleanup"
+
+roundrobin_head()
+{
+       atf_set descr 'multiple gateways of mixed AF, including prefixes and 
tables, for IPv6 packets'
+       atf_set require.user root
+}
+
+roundrobin_body()
+{
+       pf_map_addr_common
+
+       # The rule is defined as "inet6 proto tcp" so directly given IPv4 hosts
+       # will be removed from the pool by pfctl. Tables will still be loaded
+       # and pf_map_addr() will only use IPv6 addresses from them. It will
+       # iterate over members of the pool and inside of tables and prefixes.
+
+       jexec router pfctl -e
+       pft_set_rules router \
+               "set debug loud" \
+               "set reassemble yes" \
+               "set state-policy if-bound" \
+               "table <rt_targets> { ${net_server2_4}.40/31 
${net_server2_4}.44 ${net_server2_6}::42:0/127 ${net_server2_6}::42:4 }" \
+               "pass in on ${epair_tester}b \
+                       route-to { \
+                               (${epair_server1}a 
${net_server1_4_host_server}) \
+                               (${epair_server2}a <rt_targets_empty>) \
+                               (${epair_server1}a ${net_server1_6}::42:0/127) \
+                               (${epair_server2}a <rt_targets_empty>) \
+                               (${epair_server2}a <rt_targets>) \
+                       } \
+                       inet6 proto tcp \
+                       keep state"
+
+       for port in $(seq 1 6); do
+               port=$((4200 + port))
+               atf_check -s exit:0 ${common_dir}/pft_ping.py \
+                       --sendif ${epair_tester}a --replyif ${epair_tester}a \
+                       --fromaddr ${net_clients_6}::1 --to ${host_server_6} \
+                       --ping-type=tcp3way --send-sport=${port}
+       done
+
+       states=$(mktemp) || exit 1
+       jexec router pfctl -qvvss | normalize_pfctl_s > $states
+
+       for state_regexp in \
+               "${epair_tester}b tcp ${host_server_6}\[9\] <- 
${net_clients_6}::1\[4201\] .* route-to: 
${net_server1_6}::42:0@${epair_server1}a" \
+               "${epair_tester}b tcp ${host_server_6}\[9\] <- 
${net_clients_6}::1\[4202\] .* route-to: 
${net_server1_6}::42:1@${epair_server1}a" \
+               "${epair_tester}b tcp ${host_server_6}\[9\] <- 
${net_clients_6}::1\[4203\] .* route-to: 
${net_server2_6}::42:0@${epair_server2}a" \
+               "${epair_tester}b tcp ${host_server_6}\[9\] <- 
${net_clients_6}::1\[4204\] .* route-to: 
${net_server2_6}::42:1@${epair_server2}a" \
+               "${epair_tester}b tcp ${host_server_6}\[9\] <- 
${net_clients_6}::1\[4205\] .* route-to: 
${net_server2_6}::42:4@${epair_server2}a" \
+               "${epair_tester}b tcp ${host_server_6}\[9\] <- 
${net_clients_6}::1\[4206\] .* route-to: 
${net_server1_6}::42:0@${epair_server1}a" \
+       ; do
+               grep -qE "${state_regexp}" $states || atf_fail "State not found 
for '${state_regexp}'"
+       done
+}
+
+roundrobin_cleanup()
+{
+       pft_cleanup
+}
+
+atf_test_case "random_table" "cleanup"
+
+random_table_head()
+{
+       atf_set descr 'Pool with random flag and a table for IPv6'
+       atf_set require.user root
+}
+
+random_table_body()
+{
+       pf_map_addr_common
+
+       # The "random" flag will pick random hosts from the table but will
+       # not dive into prefixes, always choosing the 0th address.
+       # The proper address family will be chosen.
+
+       jexec router pfctl -e
+       pft_set_rules router \
+               "set debug loud" \
+               "set reassemble yes" \
+               "set state-policy if-bound" \
+               "table <rt_targets> { ${net_server2_4}.40/31 
${net_server2_4}.44 ${net_server2_6}::42:0/127 ${net_server2_6}::42:4 }" \
+               "pass in on ${epair_tester}b \
+                       route-to { (${epair_server2}a <rt_targets>) } random \
+                       inet6 proto tcp \
+                       keep state"
+
+       good_targets="${net_server2_6}::42:0 ${net_server2_6}::42:4"
+       bad_targets="${net_server2_6}::42:1"
+       check_random IPv6 "${good_targets}" "${bad_targets}"
+}
+
+random_table_cleanup()
+{
+       pft_cleanup
+}
+
+atf_test_case "random_prefix" "cleanup"
+
+random_prefix_head()
+{
+       atf_set descr 'Pool with random flag and a table for IPv4'
+       atf_set require.user root
+}
+
+random_prefix_body()
+{
+       pf_map_addr_common
+
+       # The "random" flag will pick random hosts from given prefix.
+       # The choice being random makes testing it non-trivial. We do 10
+       # attempts to have each target chosen. Hopefully this is enough to have
+       # this test pass often enough.
+
+       jexec router pfctl -e
+       pft_set_rules router \
+               "set debug loud" \
+               "set reassemble yes" \
+               "set state-policy if-bound" \
+               "pass in on ${epair_tester}b \
+                       route-to { (${epair_server2}a 
${net_server2_6}::42:0/127) } random \
+                       inet6 proto tcp \
+                       keep state"
+
+       good_targets="${net_server2_6}::42:0 ${net_server2_6}::42:1"
+       check_random IPv6 "${good_targets}"
+}
+
+random_prefix_cleanup()
+{
+       pft_cleanup
+}
+
+atf_test_case "prefer_ipv6_nexthop_single_ipv4" "cleanup"
+
+prefer_ipv6_nexthop_single_ipv4_head()
+{
+       atf_set descr 'prefer-ipv6-nexthop option for a single IPv4 gateway'
+       atf_set require.user root
+}
+
+prefer_ipv6_nexthop_single_ipv4_body()
+{
+       pf_map_addr_common
+
+       # Basic forwarding test for prefer-ipv6-nexthop pool option.
+       # A single IPv4 gateway will work only for IPv4 traffic.
+
+       jexec router pfctl -e
+       pft_set_rules router \
+               "set reassemble yes" \
*** 565 LINES SKIPPED ***

Reply via email to