Hello,

David (dlg@) asked me to shrink the scope of the change.  The new diff
introduces a mutex to pf_state and adjust pf_detach_state() to utilize the
newly introduced mutex.  I've also added annotation to pf_state members. The
members with '?' needs our attention. We need to verify if they are either
protected by global state lock, or if we cover them by newly introduced
mutex.

new diff also adds mtx_init() to pf_state_import(). This was missing in
my earlier diff. It was found and reported by Hrvoje@.

thanks and
regards
sashan


--------8<---------------8<---------------8<------------------8<--------
diff --git a/sys/net/pf.c b/sys/net/pf.c
index 2c13c99baac..c6022867dd4 100644
--- a/sys/net/pf.c
+++ b/sys/net/pf.c
@@ -185,7 +185,8 @@ int                  pf_translate_icmp_af(struct pf_pdesc*, 
int, void *);
 void                    pf_send_icmp(struct mbuf *, u_int8_t, u_int8_t, int,
                            sa_family_t, struct pf_rule *, u_int);
 void                    pf_detach_state(struct pf_state *);
-void                    pf_state_key_detach(struct pf_state *, int);
+void                    pf_state_key_detach(struct pf_state *,
+                           struct pf_state_key *);
 u_int32_t               pf_tcp_iss(struct pf_pdesc *);
 void                    pf_rule_to_actions(struct pf_rule *,
                            struct pf_rule_actions *);
@@ -779,7 +780,8 @@ pf_state_key_attach(struct pf_state_key *sk, struct 
pf_state *s, int idx)
                s->key[idx] = sk;
 
        if ((si = pool_get(&pf_state_item_pl, PR_NOWAIT)) == NULL) {
-               pf_state_key_detach(s, idx);
+               pf_state_key_detach(s, s->key[idx]);
+               s->key[idx] = NULL;
                return (-1);
        }
        si->s = s;
@@ -799,42 +801,50 @@ pf_state_key_attach(struct pf_state_key *sk, struct 
pf_state *s, int idx)
 void
 pf_detach_state(struct pf_state *s)
 {
-       if (s->key[PF_SK_WIRE] == s->key[PF_SK_STACK])
-               s->key[PF_SK_WIRE] = NULL;
+       struct pf_state_key *key[2];
+
+       mtx_enter(&s->mtx);
+       key[PF_SK_WIRE] = s->key[PF_SK_WIRE];
+       key[PF_SK_STACK] = s->key[PF_SK_STACK];
+       s->key[PF_SK_WIRE] = NULL;
+       s->key[PF_SK_STACK] = NULL;
+       mtx_leave(&s->mtx);
+
+       if (key[PF_SK_WIRE] == key[PF_SK_STACK])
+               key[PF_SK_WIRE] = NULL;
 
-       if (s->key[PF_SK_STACK] != NULL)
-               pf_state_key_detach(s, PF_SK_STACK);
+       if (key[PF_SK_STACK] != NULL)
+               pf_state_key_detach(s, key[PF_SK_STACK]);
 
-       if (s->key[PF_SK_WIRE] != NULL)
-               pf_state_key_detach(s, PF_SK_WIRE);
+       if (key[PF_SK_WIRE] != NULL)
+               pf_state_key_detach(s, key[PF_SK_WIRE]);
 }
 
 void
-pf_state_key_detach(struct pf_state *s, int idx)
+pf_state_key_detach(struct pf_state *s, struct pf_state_key *key)
 {
        struct pf_state_item    *si;
-       struct pf_state_key     *sk;
 
-       if (s->key[idx] == NULL)
+       PF_STATE_ASSERT_LOCKED();
+
+       if (key == NULL)
                return;
 
-       si = TAILQ_FIRST(&s->key[idx]->states);
+       si = TAILQ_FIRST(&key->states);
        while (si && si->s != s)
            si = TAILQ_NEXT(si, entry);
 
        if (si) {
-               TAILQ_REMOVE(&s->key[idx]->states, si, entry);
+               TAILQ_REMOVE(&key->states, si, entry);
                pool_put(&pf_state_item_pl, si);
        }
 
-       sk = s->key[idx];
-       s->key[idx] = NULL;
-       if (TAILQ_EMPTY(&sk->states)) {
-               RB_REMOVE(pf_state_tree, &pf_statetbl, sk);
-               sk->removed = 1;
-               pf_state_key_unlink_reverse(sk);
-               pf_state_key_unlink_inpcb(sk);
-               pf_state_key_unref(sk);
+       if (TAILQ_EMPTY(&key->states)) {
+               RB_REMOVE(pf_state_tree, &pf_statetbl, key);
+               key->removed = 1;
+               pf_state_key_unlink_reverse(key);
+               pf_state_key_unlink_inpcb(key);
+               pf_state_key_unref(key);
        }
 }
 
@@ -997,7 +1007,9 @@ pf_state_insert(struct pfi_kif *kif, struct pf_state_key 
**skw,
                }
                *skw = s->key[PF_SK_WIRE];
                if (pf_state_key_attach(*sks, s, PF_SK_STACK)) {
-                       pf_state_key_detach(s, PF_SK_WIRE);
+                       pf_state_key_detach(s, s->key[PF_SK_WIRE]);
+                       s->key[PF_SK_WIRE] = NULL;
+                       *skw = NULL;
                        PF_STATE_EXIT_WRITE();
                        return (-1);
                }
@@ -1429,6 +1441,7 @@ pf_state_import(const struct pfsync_state *sp, int flags)
        st->sync_state = PFSYNC_S_NONE;
 
        refcnt_init(&st->refcnt);
+       mtx_init(&st->mtx, IPL_NET);
 
        /* XXX when we have anchors, use STATE_INC_COUNTERS */
        r->states_cur++;
@@ -4348,6 +4361,7 @@ pf_create_state(struct pf_pdesc *pd, struct pf_rule *r, 
struct pf_rule *a,
         * pf_state_inserts() grabs reference for pfsync!
         */
        refcnt_init(&s->refcnt);
+       mtx_init(&s->mtx, IPL_NET);
 
        switch (pd->proto) {
        case IPPROTO_TCP:
diff --git a/sys/net/pfvar.h b/sys/net/pfvar.h
index cc6c9f3dedc..9a75f9a1e6c 100644
--- a/sys/net/pfvar.h
+++ b/sys/net/pfvar.h
@@ -743,37 +743,49 @@ struct pf_state_cmp {
        u_int8_t                 pad[3];
 };
 
+/*
+ * pf_state structure members require protection
+ * either by state lock (a global rw-lock) or
+ * needs to be protected by mutex which is part
+ * of state. The annotation goes as follows:
+ *     - stateRW: means protected by exclusive state lock
+ *     - mtx: protected by mutex (pf_state::state_mtx)
+ *     - syncMTX: protected by mutex in pfsync
+ *     - I: immutable since creation
+ *     - ?: needs to be revisited (require mtx).
+ */
 struct pf_state {
-       u_int64_t                id;
-       u_int32_t                creatorid;
-       u_int8_t                 direction;
+       u_int64_t                id;            /* I */
+       u_int32_t                creatorid;     /* I */
+       u_int8_t                 direction;     /* I */
        u_int8_t                 pad[3];
 
-       TAILQ_ENTRY(pf_state)    sync_list;
-       TAILQ_ENTRY(pf_state)    sync_snap;
-       TAILQ_ENTRY(pf_state)    entry_list;
-       SLIST_ENTRY(pf_state)    gc_list;
-       RB_ENTRY(pf_state)       entry_id;
+       TAILQ_ENTRY(pf_state)    sync_list;     /* syncMTX */
+       TAILQ_ENTRY(pf_state)    sync_snap;     /* syncMTX */
+       TAILQ_ENTRY(pf_state)    entry_list;    /* stateRW */
+       SLIST_ENTRY(pf_state)    gc_list;       /* syncMTX */
+       RB_ENTRY(pf_state)       entry_id;      /* stateRW */
        struct pf_state_peer     src;
        struct pf_state_peer     dst;
-       struct pf_rule_slist     match_rules;
-       union pf_rule_ptr        rule;
-       union pf_rule_ptr        anchor;
-       union pf_rule_ptr        natrule;
-       struct pf_addr           rt_addr;
-       struct pf_sn_head        src_nodes;
-       struct pf_state_key     *key[2];        /* addresses stack and wire  */
-       struct pfi_kif          *kif;
-       u_int64_t                packets[2];
-       u_int64_t                bytes[2];
-       int32_t                  creation;
-       int32_t                  expire;
-       int32_t                  pfsync_time;
-       int                      rtableid[2];   /* rtables stack and wire */
-       u_int16_t                qid;
-       u_int16_t                pqid;
-       u_int16_t                tag;
-       u_int16_t                state_flags;
+       struct pf_rule_slist     match_rules;   /* I */
+       union pf_rule_ptr        rule;          /* I */
+       union pf_rule_ptr        anchor;        /* I */
+       union pf_rule_ptr        natrule;       /* I */
+       struct pf_addr           rt_addr;       /* I */
+       struct pf_sn_head        src_nodes;     /* I */
+       struct pf_state_key     *key[2];        /* mtx, stack and wire  */
+       struct pfi_kif          *kif;           /* I */
+       struct mutex             mtx;
+       u_int64_t                packets[2];    /* ? */
+       u_int64_t                bytes[2];      /* ? */
+       int32_t                  creation;      /* I */
+       int32_t                  expire;        /* ? */
+       int32_t                  pfsync_time;   /* ? */
+       int                      rtableid[2];   /* I rtables stack and wire */
+       u_int16_t                qid;           /* I */
+       u_int16_t                pqid;          /* I */
+       u_int16_t                tag;           /* I */
+       u_int16_t                state_flags;   /* ? */
 #define        PFSTATE_ALLOWOPTS       0x0001
 #define        PFSTATE_SLOPPY          0x0002
 #define        PFSTATE_PFLOW           0x0004
@@ -787,20 +799,20 @@ struct pf_state {
 #define        PFSTATE_INP_UNLINKED    0x0400
 #define        PFSTATE_SCRUBMASK 
(PFSTATE_NODF|PFSTATE_RANDOMID|PFSTATE_SCRUB_TCP)
 #define        PFSTATE_SETMASK   (PFSTATE_SETTOS|PFSTATE_SETPRIO)
-       u_int8_t                 log;
-       u_int8_t                 timeout;
-       u_int8_t                 sync_state; /* PFSYNC_S_x */
-       u_int8_t                 sync_updates;
-       u_int8_t                 min_ttl;
-       u_int8_t                 set_tos;
-       u_int8_t                 set_prio[2];
-       u_int16_t                max_mss;
-       u_int16_t                if_index_in;
-       u_int16_t                if_index_out;
+       u_int8_t                 log;           /* I */
+       u_int8_t                 timeout;       /* ? */
+       u_int8_t                 sync_state;    /* ? PFSYNC_S_x */
+       u_int8_t                 sync_updates;  /* ? */
+       u_int8_t                 min_ttl;       /* I */
+       u_int8_t                 set_tos;       /* I */
+       u_int8_t                 set_prio[2];   /* I */
+       u_int16_t                max_mss;       /* I */
+       u_int16_t                if_index_in;   /* I */
+       u_int16_t                if_index_out;  /* I */
        pf_refcnt_t              refcnt;
-       u_int16_t                delay;
-       u_int8_t                 rt;
-       u_int8_t                 snapped;
+       u_int16_t                delay;         /* I */
+       u_int8_t                 rt;            /* I */
+       u_int8_t                 snapped;       /* syncMTX */
 };
 
 /*

Reply via email to