Hi,

This looks like a hack for a problem that should not exist.

What is the MTU of the outgoing interface on your pf router?  If
the layer 2 switches do not support 9k jumbo frames, it must be
1500.

Why are the outgoing packets not fragmented to the MTU?  Is the
dont-fragment flag set?  Does pf preserve the DF flag when reassembling
and forwarding?

I think we must clear the DF of reassembled forwarded packets.  Do
you see fragments with DF?

If we have fragments with DF and reassemble them, we should have
the logic like you suggest.  Then we are basically in the same
sitution as IPv6 and have to preserve the fragment length for path
MTU discovery.

I have not seen this in the last 10 years.  And to reduce complexity
I would prefer to clear the DF instead.  Have you tried the no-df
option?

If your solution fixes a DF problem, and makes the no-df option
superfluous, and works out of the box, we can consider implementing
it for DF packets.

bluhm

On Tue, Aug 31, 2021 at 10:56:34PM +1000, David Gwynne wrote:
> i am in an annoying situation where i peer with a campus network on an
> ospf link with a 9k mtu, but some corners of that network have layer 2
> hops that don't support 9k packets. i sometimes want to tunnel large
> (1500 byte) packets to hosts in those corners of the network by
> letting the encapsulation protocol fragment. the tunnel endpoint
> will then reassemble the packet and forward the full sized frame
> as if nothing untoward happened.
> 
> the problem is that pf on the ospf hop "helps" by reassembling these
> fragmented tunnel packets before sending them out the 9k ospf link.
> the layer 2 hops then drop the packet because it's too big. i do
> want pf to reassemble the packets so it can check it, but i also
> want it to refragment it again afterward.
> 
> it turns out this is something that happens for ipv6 already, because
> fragmentation in v6 is only supposed to be done by the endpoints. this
> diff allows this same semantic for v4 packets if requested. to enable
> it, configure "set reassemble yes refragment" in your pf.conf and it
> will do the same for v4 that it does for v6.
> 
> i've only tested this lightly and now i need sleep. anyone have any
> thoughts on this?
> 
> note that m_tag_find is really cheap if the tag doesnt exist thanks to
> henning@.
> 
> Index: sys/net/pf.c
> ===================================================================
> RCS file: /cvs/src/sys/net/pf.c,v
> retrieving revision 1.1122
> diff -u -p -r1.1122 pf.c
> --- sys/net/pf.c      7 Jul 2021 18:38:25 -0000       1.1122
> +++ sys/net/pf.c      31 Aug 2021 12:42:51 -0000
> @@ -6049,6 +6049,7 @@ void
>  pf_route(struct pf_pdesc *pd, struct pf_state *s)
>  {
>       struct mbuf             *m0;
> +     struct m_tag            *mtag;
>       struct mbuf_list         fml;
>       struct sockaddr_in      *dst, sin;
>       struct rtentry          *rt = NULL;
> @@ -6132,6 +6133,15 @@ pf_route(struct pf_pdesc *pd, struct pf_
>               ip = mtod(m0, struct ip *);
>       }
>  
> +     /*
> +      * If packet has been reassembled by PF earlier, we might have to
> +      * use pf_refragment4() here to turn it back to fragments.
> +      */
> +     if ((mtag = m_tag_find(m0, PACKET_TAG_PF_REASSEMBLED, NULL))) {
> +             (void) pf_refragment4(&m0, mtag, dst, ifp, rt);
> +             goto done;
> +     }
> +
>       in_proto_cksum_out(m0, ifp);
>  
>       if (ntohs(ip->ip_len) <= ifp->if_mtu) {
> @@ -7357,16 +7367,20 @@ done:
>               break;
>       }
>  
> -#ifdef INET6
>       /* if reassembled packet passed, create new fragments */
> -     if (pf_status.reass && action == PF_PASS && pd.m && fwdir == PF_FWD &&
> -         pd.af == AF_INET6) {
> +     if (pf_status.reass && action == PF_PASS && pd.m && fwdir == PF_FWD) {
>               struct m_tag    *mtag;
>  
> -             if ((mtag = m_tag_find(pd.m, PACKET_TAG_PF_REASSEMBLED, NULL)))
> +             mtag = m_tag_find(pd.m, PACKET_TAG_PF_REASSEMBLED, NULL);
> +             if (mtag == NULL)
> +                     ; /* no reassembly required */
> +#ifdef INET6
> +             else if (pd.af == AF_INET6)
>                       action = pf_refragment6(&pd.m, mtag, NULL, NULL, NULL);
> -     }
>  #endif       /* INET6 */
> +             else
> +                     action = pf_refragment4(&pd.m, mtag, NULL, NULL, NULL);
> +     }
>       if (s && action != PF_DROP) {
>               if (!s->if_index_in && dir == PF_IN)
>                       s->if_index_in = ifp->if_index;
> Index: sys/net/pf_norm.c
> ===================================================================
> RCS file: /cvs/src/sys/net/pf_norm.c,v
> retrieving revision 1.223
> diff -u -p -r1.223 pf_norm.c
> --- sys/net/pf_norm.c 10 Mar 2021 10:21:48 -0000      1.223
> +++ sys/net/pf_norm.c 31 Aug 2021 12:42:51 -0000
> @@ -782,7 +782,7 @@ pf_reassemble(struct mbuf **m0, int dir,
>       struct pf_frent         *frent;
>       struct pf_fragment      *frag;
>       struct pf_frnode         key;
> -     u_int16_t                total, hdrlen;
> +     u_int16_t                total, maxlen, hdrlen;
>  
>       /* Get an entry for the fragment queue */
>       if ((frent = pf_create_fragment(reason)) == NULL)
> @@ -821,6 +821,7 @@ pf_reassemble(struct mbuf **m0, int dir,
>       /* We have all the data */
>       frent = TAILQ_FIRST(&frag->fr_queue);
>       KASSERT(frent != NULL);
> +     maxlen = frag->fr_maxlen;
>       total = TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_off +
>           TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_len;
>       hdrlen = frent->fe_hdrlen;
> @@ -843,6 +844,63 @@ pf_reassemble(struct mbuf **m0, int dir,
>  
>       PF_FRAG_UNLOCK();
>       DPFPRINTF(LOG_INFO, "complete: %p(%d)", m, ntohs(ip->ip_len));
> +
> +     if (ISSET(pf_status.reass, PF_REASS_REFRAG)) {
> +             struct m_tag *mtag;
> +             struct pf_fragment_tag *ftag;
> +
> +             mtag = m_tag_get(PACKET_TAG_PF_REASSEMBLED, sizeof(*ftag),
> +                 M_NOWAIT);
> +             if (mtag == NULL) {
> +                     REASON_SET(reason, PFRES_MEMORY);
> +                     return (PF_DROP);
> +             }
> +
> +             ftag = (struct pf_fragment_tag *)(mtag + 1);
> +             ftag->ft_hdrlen = hdrlen;
> +             ftag->ft_maxlen = maxlen;
> +             m_tag_prepend(m, mtag);
> +     }
> +
> +     return (PF_PASS);
> +}
> +
> +int
> +pf_refragment4(struct mbuf **m0, struct m_tag *mtag, struct sockaddr_in *dst,
> +    struct ifnet *ifp, struct rtentry *rt)
> +{
> +     struct mbuf             *m = *m0;
> +     struct mbuf_list         fml;
> +     struct pf_fragment_tag  *ftag = (struct pf_fragment_tag *)(mtag + 1);
> +     u_int32_t                mtu;
> +     u_int16_t                maxlen, hdrlen;
> +     int                      error;
> +
> +     hdrlen = ftag->ft_hdrlen;
> +     maxlen = ftag->ft_maxlen;
> +     m_tag_delete(m, mtag);
> +     mtag = NULL;
> +     ftag = NULL;
> +
> +     /* Checksum must be calculated for the whole packet */
> +     in_proto_cksum_out(m, NULL);
> +
> +     mtu = hdrlen + maxlen;
> +     error = ip_fragment(m, &fml, ifp, mtu);
> +     *m0 = NULL;     /* ip_fragment() has consumed original packet. */
> +     if (error) {
> +             DPFPRINTF(LOG_NOTICE, "ip_fragment error %d", error);
> +             return (PF_DROP);
> +     }
> +
> +     while ((m = ml_dequeue(&fml)) != NULL) {
> +             m->m_pkthdr.pf.flags |= PF_TAG_REFRAGMENTED;
> +             if (ifp == NULL)
> +                     ip_output(m, NULL, NULL, IP_RAWOUTPUT, NULL, NULL, 0);
> +             else
> +                     ifp->if_output(ifp, m, sintosa(dst), rt);
> +     }
> +
>       return (PF_PASS);
>  }
>  
> Index: sys/net/pfvar.h
> ===================================================================
> RCS file: /cvs/src/sys/net/pfvar.h,v
> retrieving revision 1.502
> diff -u -p -r1.502 pfvar.h
> --- sys/net/pfvar.h   23 Jun 2021 06:53:52 -0000      1.502
> +++ sys/net/pfvar.h   31 Aug 2021 12:42:51 -0000
> @@ -1329,6 +1329,7 @@ struct pf_status {
>  
>  #define PF_REASS_ENABLED     0x01
>  #define PF_REASS_NODF                0x02
> +#define PF_REASS_REFRAG              0x04
>  
>  #define      PF_SYNCOOKIES_NEVER     0
>  #define      PF_SYNCOOKIES_ALWAYS    1
> @@ -1770,6 +1771,8 @@ int     pf_match_gid(u_int8_t, gid_t, gid_t,
>  
>  int  pf_refragment6(struct mbuf **, struct m_tag *mtag,
>           struct sockaddr_in6 *, struct ifnet *, struct rtentry *);
> +int  pf_refragment4(struct mbuf **, struct m_tag *mtag,
> +         struct sockaddr_in *, struct ifnet *, struct rtentry *);
>  void pf_normalize_init(void);
>  int  pf_normalize_ip(struct pf_pdesc *, u_short *);
>  int  pf_normalize_ip6(struct pf_pdesc *, u_short *);
> Index: share/man/man5/pf.conf.5
> ===================================================================
> RCS file: /cvs/src/share/man/man5/pf.conf.5,v
> retrieving revision 1.587
> diff -u -p -r1.587 pf.conf.5
> --- share/man/man5/pf.conf.5  19 Jul 2021 16:23:56 -0000      1.587
> +++ share/man/man5/pf.conf.5  31 Aug 2021 12:42:51 -0000
> @@ -1317,7 +1317,7 @@ Alias for
>  .Pp
>  The default value is
>  .Cm normal .
> -.It Ic set Cm reassemble yes | no Op Cm no-df
> +.It Ic set Cm reassemble yes | no Oo Cm no-df Oc Oo Cm refragment Oc
>  The
>  .Cm reassemble
>  option is used to enable or disable the reassembly of fragmented packets,
> @@ -1336,6 +1336,10 @@ the reassembled packet will have the
>  bit cleared.
>  The default value is
>  .Cm yes .
> +If
> +.Cm refragment
> +is also specified, a reassembled packet will be refragmented before
> +being forwarded.
>  .It Ic set Cm ruleset-optimization Ar level
>  .Bl -tag -width profile -compact
>  .It Cm basic
> @@ -2775,7 +2779,10 @@ option         = "set" ( [ "timeout" ( t
>                   [ "skip on" ifspec ] |
>                   [ "debug" ( "emerg" | "alert" | "crit" | "err" |
>                   "warning" | "notice" | "info" | "debug" ) ] |
> -                 [ "reassemble" ( "yes" | "no" ) [ "no-df" ] ] )
> +                 [ "reassemble" ( "yes" | "no" ) [ reassembleopts ] ] )
> +
> +reassembleopts = reassembleopt [ reassembleopts ]
> +reassembleopt  = "no-df" | "refragment"
>  
>  pf-rule        = action [ ( "in" | "out" ) ]
>                   [ "log" [ "(" logopts ")"] ] [ "quick" ]
> Index: sbin/pfctl/parse.y
> ===================================================================
> RCS file: /cvs/src/sbin/pfctl/parse.y,v
> retrieving revision 1.709
> diff -u -p -r1.709 parse.y
> --- sbin/pfctl/parse.y        1 Feb 2021 00:31:04 -0000       1.709
> +++ sbin/pfctl/parse.y        31 Aug 2021 12:42:51 -0000
> @@ -470,7 +470,7 @@ int       parseport(char *, struct range *r, i
>  
>  %token       PASS BLOCK MATCH SCRUB RETURN IN OS OUT LOG QUICK ON FROM TO 
> FLAGS
>  %token       RETURNRST RETURNICMP RETURNICMP6 PROTO INET INET6 ALL ANY 
> ICMPTYPE
> -%token       ICMP6TYPE CODE KEEP MODULATE STATE PORT BINATTO NODF
> +%token       ICMP6TYPE CODE KEEP MODULATE STATE PORT BINATTO NODF REFRAGMENT
>  %token       MINTTL ERROR ALLOWOPTS FILENAME ROUTETO DUPTO REPLYTO NO LABEL
>  %token       NOROUTE URPFFAILED FRAGMENT USER GROUP MAXMSS MAXIMUM TTL TOS 
> DROP TABLE
>  %token       REASSEMBLE ANCHOR SYNCOOKIES
> @@ -490,11 +490,12 @@ int     parseport(char *, struct range *r, i
>  %token       <v.i>                   PORTBINARY
>  %type        <v.interface>           interface if_list if_item_not if_item
>  %type        <v.number>              number icmptype icmp6type uid gid
> -%type        <v.number>              tos not yesno optnodf
> +%type        <v.number>              tos not yesno
>  %type        <v.probability>         probability
>  %type        <v.weight>              optweight
>  %type        <v.i>                   dir af optimizer syncookie_val
>  %type        <v.i>                   sourcetrack flush unaryop statelock
> +%type        <v.i>                   setreass_opts setreass_opts_l 
> setreass_opt
>  %type        <v.b>                   action
>  %type        <v.b>                   flags flag blockspec prio
>  %type        <v.range>               portplain portstar portrange
> @@ -588,11 +589,19 @@ optimizer       : string        {
>               }
>               ;
>  
> -optnodf              : /* empty */   { $$ = 0; }
> -             | NODF          { $$ = 1; }
> +setreass_opts        : /* empty */                           { $$ = 0; }
> +             | setreass_opts_l                       { $$ = $1; }
>               ;
>  
> -option               : SET REASSEMBLE yesno optnodf          {
> +setreass_opts_l      : setreass_opts_l setreass_opt          { $$ = $1 | $2; 
> }
> +             | setreass_opt                          { $$ = $1; }
> +             ;
> +
> +setreass_opt : NODF          { $$ = PF_REASS_NODF; }
> +             | REFRAGMENT    { $$ = PF_REASS_REFRAG; }
> +             ;
> +
> +option               : SET REASSEMBLE yesno setreass_opts    {
>                       pfctl_set_reassembly(pf, $3, $4);
>               }
>               | SET OPTIMIZATION STRING               {
> @@ -5014,6 +5023,7 @@ lookup(char *s)
>               { "rdr-to",             RDRTO},
>               { "reassemble",         REASSEMBLE},
>               { "received-on",        RECEIVEDON},
> +             { "refragment",         REFRAGMENT},
>               { "reply-to",           REPLYTO},
>               { "return",             RETURN},
>               { "return-icmp",        RETURNICMP},
> Index: sbin/pfctl/pfctl.c
> ===================================================================
> RCS file: /cvs/src/sbin/pfctl/pfctl.c,v
> retrieving revision 1.383
> diff -u -p -r1.383 pfctl.c
> --- sbin/pfctl/pfctl.c        14 Oct 2020 19:30:37 -0000      1.383
> +++ sbin/pfctl/pfctl.c        31 Aug 2021 12:42:51 -0000
> @@ -1907,20 +1907,20 @@ pfctl_set_synflwats(struct pfctl *pf, u_
>  }
>  
>  int
> -pfctl_set_reassembly(struct pfctl *pf, int on, int nodf)
> +pfctl_set_reassembly(struct pfctl *pf, int on, u_int32_t reass)
>  {
>       pf->reass_set = 1;
>       if (on) {
> -             pf->reassemble = PF_REASS_ENABLED;
> -             if (nodf)
> -                     pf->reassemble |= PF_REASS_NODF;
> +             pf->reassemble = PF_REASS_ENABLED | reass;
>       } else {
>               pf->reassemble = 0;
>       }
>  
> -     if (pf->opts & PF_OPT_VERBOSE)
> -             printf("set reassemble %s %s\n", on ? "yes" : "no",
> -                 nodf ? "no-df" : "");
> +     if (pf->opts & PF_OPT_VERBOSE) {
> +             printf("set reassemble %s%s%s\n", on ? "yes" : "no",
> +                 (reass & PF_REASS_NODF) ? " no-df" : "",
> +                 (reass & PF_REASS_REFRAG) ? " refragment" : "");
> +     }
>  
>       return (0);
>  }
> Index: sbin/pfctl/pfctl_parser.h
> ===================================================================
> RCS file: /cvs/src/sbin/pfctl/pfctl_parser.h,v
> retrieving revision 1.117
> diff -u -p -r1.117 pfctl_parser.h
> --- sbin/pfctl/pfctl_parser.h 21 Jul 2020 14:10:51 -0000      1.117
> +++ sbin/pfctl/pfctl_parser.h 31 Aug 2021 12:42:51 -0000
> @@ -221,7 +221,7 @@ int     add_opt_table(struct pfctl *, st
>  void pfctl_add_rule(struct pfctl *, struct pf_rule *);
>  
>  int  pfctl_set_timeout(struct pfctl *, const char *, int, int);
> -int  pfctl_set_reassembly(struct pfctl *, int, int);
> +int  pfctl_set_reassembly(struct pfctl *, int, u_int32_t);
>  int  pfctl_set_syncookies(struct pfctl *, u_int8_t,
>           struct pfctl_watermarks *);
>  int  pfctl_set_optimization(struct pfctl *, const char *);

Reply via email to