[Patch 1/6] IPSEC: core updates

2006-01-27 Thread jamal

the core changes

cheers,
jamal

This patch provides the core functionality needed for sync events
for ipsec.

Signed-off-by: Jamal Hadi Salim <[EMAIL PROTECTED]>
---

 include/linux/sysctl.h |3 ++
 include/linux/xfrm.h   |   30 
 include/net/xfrm.h |   33 +
 net/core/sysctl_net_core.c |   32 +
 net/xfrm/xfrm_state.c  |   83 +++-
 5 files changed, 178 insertions(+), 3 deletions(-)

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 8352a7c..d5d8156 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -258,6 +258,9 @@ enum
NET_CORE_DEV_WEIGHT=17,
NET_CORE_SOMAXCONN=18,
NET_CORE_BUDGET=19,
+   NET_CORE_AEVENT_ON=20,
+   NET_CORE_AEVENT_ETIME=21,
+   NET_CORE_AEVENT_RSEQTH=22,
 };
 
 /* /proc/sys/net/ethernet */
diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
index 82fbb75..b54a129 100644
--- a/include/linux/xfrm.h
+++ b/include/linux/xfrm.h
@@ -156,6 +156,10 @@ enum {
XFRM_MSG_FLUSHPOLICY,
 #define XFRM_MSG_FLUSHPOLICY XFRM_MSG_FLUSHPOLICY
 
+   XFRM_MSG_NEWAE,
+#define XFRM_MSG_NEWAE XFRM_MSG_NEWAE
+   XFRM_MSG_GETAE,
+#define XFRM_MSG_GETAE XFRM_MSG_GETAE
__XFRM_MSG_MAX
 };
 #define XFRM_MSG_MAX (__XFRM_MSG_MAX - 1)
@@ -194,6 +198,21 @@ struct xfrm_encap_tmpl {
xfrm_address_t  encap_oa;
 };
 
+/* AEVENT flags  */
+enum xfrm_ae_ftype_t {
+   XFRM_AE_UNSPEC,
+   XFRM_AE_RTHR=1, /* replay threshold*/
+   XFRM_AE_RVAL=2, /* replay value */
+   XFRM_AE_LVAL=4, /* lifetime value */
+   XFRM_AE_ETHR=8, /* expiry timer threshold */
+   XFRM_AE_CR=16, /* Event cause is replay update */
+   XFRM_AE_CE=32, /* Event cause is timer expiry */
+   XFRM_AE_CU=64, /* Event cause is policy update */
+   __XFRM_AE_MAX
+
+#define XFRM_AE_MAX (__XFRM_AE_MAX - 1)
+};
+
 /* Netlink message attributes.  */
 enum xfrm_attr_type_t {
XFRMA_UNSPEC,
@@ -205,6 +224,10 @@ enum xfrm_attr_type_t {
XFRMA_SA,
XFRMA_POLICY,
XFRMA_SEC_CTX,  /* struct xfrm_sec_ctx */
+   XFRMA_LTIME_VAL,
+   XFRMA_REPLAY_VAL,
+   XFRMA_REPLAY_THRESH,
+   XFRMA_ETIMER_THRESH,
__XFRMA_MAX
 
 #define XFRMA_MAX (__XFRMA_MAX - 1)
@@ -235,6 +258,11 @@ struct xfrm_usersa_id {
__u8proto;
 };
 
+struct xfrm_aevent_id {
+   __u32   flags;
+   struct xfrm_usersa_id   sa_id;
+};
+
 struct xfrm_userspi_info {
struct xfrm_usersa_info info;
__u32   min;
@@ -306,6 +334,8 @@ enum xfrm_nlgroups {
 #define XFRMNLGRP_SA   XFRMNLGRP_SA
XFRMNLGRP_POLICY,
 #define XFRMNLGRP_POLICY   XFRMNLGRP_POLICY
+   XFRMNLGRP_AEVENTS,
+#define XFRMNLGRP_AEVENTS  XFRMNLGRP_AEVENTS
__XFRMNLGRP_MAX
 };
 #define XFRMNLGRP_MAX  (__XFRMNLGRP_MAX - 1)
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index d09ca0e..4b5140a 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -20,6 +20,10 @@
 
 #define XFRM_ALIGN8(len)   (((len) + 7) & ~7)
 
+extern u32 sysctl_xfrm_aevent_on;
+extern u32 sysctl_xfrm_aevent_etime;
+extern u32 sysctl_xfrm_aevent_rseqth;
+
 extern struct semaphore xfrm_cfg_sem;
 
 /* Organization of SPD aka "XFRM rules"
@@ -135,6 +139,16 @@ struct xfrm_state
/* State for replay detection */
struct xfrm_replay_state replay;
 
+   /* Replay detection state at the time we sent the last notification */
+   struct xfrm_replay_state preplay;
+
+   /* Replay detection notification settings */
+   u32 replay_maxage;
+   u32 replay_maxdiff;
+
+   /* Replay detection notification timer */
+   struct timer_list   rtimer;
+
/* Statistics */
struct xfrm_stats   stats;
 
@@ -169,6 +183,7 @@ struct km_event
u32 hard;
u32 proto;
u32 byid;
+   u32 aevent;
} data;
 
u32 seq;
@@ -306,7 +321,22 @@ struct xfrm_policy
struct xfrm_tmplxfrm_vec[XFRM_MAX_DEPTH];
 };
 
-#define XFRM_KM_TIMEOUT30
+/* which seqno */
+#define XFRM_REPLAY_SEQ1
+#define XFRM_REPLAY_OSEQ   2
+#define XFRM_REPLAY_SEQ_MASK   3
+/* what happened */
+#define XFRM_REPLAY_UPDATE XFRM_AE_CR
+#define XFRM_REPLAY_TIMEOUTXFRM_AE_CE
+
+#define XFRM_KM_TIMEOUT30
+
+/* default aevent timeout in units of 100ms */
+#define XFRM_AE_ETIME  10
+/* Async Event timer multiplier */
+#define XFRM_AE_ETH_M  10
+/* default seq threshold size */
+#define XFRM_AE_SEQT_SIZE  2
 
 struct xfrm_mgr
 {
@@ -861,6 +891,7 @@ extern int xfrm_state_delete(struct xfrm
 extern void xfrm_state_flush(u8 proto);
 extern int xfrm_replay_check(struct xfrm_state *x, u32 seq);
 extern void xfrm

Re: [Patch 1/6] IPSEC: core updates

2006-01-28 Thread Herbert Xu
On Fri, Jan 27, 2006 at 08:05:23AM -0500, jamal wrote:
> 
> the core changes

Thanks for the patches Jamal.  I agree with the basic idea.
  
> +extern u32 sysctl_xfrm_aevent_on;

I'd prefer for this to be automatically determined.  Indeed, this is
a generic netlink problem.  We want to be easily determine at run time
whether there are netlink sockets subscribed to a given multicast group.

Here is an idea, in addition to mc_list, we keep a bit mask of the groups
that exist on mc_list.  This bit mask can then be used to quickly determine
if a netlink skb needs to be generated or not.

This mask would be updated at bind(2) time.  Updating this is O(n) where
n is the number of members on mc_list.  This should be fine since
netlink_broadcast itself which occurs much more often is also O(n).

> +extern u32 sysctl_xfrm_aevent_etime;
> +extern u32 sysctl_xfrm_aevent_rseqth;

Why do we need these defaults? I'd rather see these be removed and just have
the user-space KM always set the values (if it needs aevent).

> diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
> index e12d0be..2ccf87a 100644
> --- a/net/xfrm/xfrm_state.c
> +++ b/net/xfrm/xfrm_state.c
>
> @@ -62,6 +65,8 @@ static void xfrm_state_gc_destroy(struct
>  {
>   if (del_timer(&x->timer))
>   BUG();
> + if (del_timer(&x->rtimer))
> + BUG();

How about just using x->timer?

> @@ -104,6 +109,20 @@ static inline unsigned long make_jiffies
>   return secs*HZ;
>  }
>  
> +void xfrm_replay_notify(struct xfrm_state *, int);

Duplicate prototype.

> @@ -762,6 +792,50 @@ out:
>  }
>  EXPORT_SYMBOL(xfrm_state_walk);
>  
> +
> +void xfrm_replay_notify(struct xfrm_state *x, int event)
> +{
> + struct km_event c;
> + /* we send notify messages in case
> +  *  1. we updated on of the sequence numbers, and the seqno difference
> +  * is at least x->replay_maxdiff, in this case we also update the
> +  * timeout of our timer function
> +  *  2. if x->replay_maxage has elapsed since last update,
> +  * and there were changes
> +  *
> +  *  The state structure must be locked!
> +  */
> +
> + switch (event) {
> + case XFRM_REPLAY_UPDATE:
> + if (x->replay_maxdiff &&
> + (x->replay.seq - x->preplay.seq < x->replay_maxdiff) &&
> + (x->replay.oseq - x->preplay.oseq < x->replay_maxdiff))
> + return;
> +
> + break;
> +
> + case XFRM_REPLAY_TIMEOUT:
> + if ((x->replay.seq == x->preplay.seq) &&
> + (x->replay.bitmap == x->preplay.bitmap) &&
> + (x->replay.oseq == x->preplay.oseq))
> + goto resched;
> +
> + break;

This seems to be counter-intuitive.  Wouldn't it make more sense to
schedule a timer in the XFRM_REPLAY_UPDATE case, and not schedule one
in the XFRM_REPLAY_TIMEOUT case? Scheduling it in the XFRM_REPLAY_TIMEOUT
case means that you may be waking up every maxage jiffies even when
nothing at all is happening.  While doing it the other way means that
you only schedule it when something has happened and we've suppressed
the event due to maxdiff.

> @@ -805,8 +879,10 @@ void xfrm_replay_advance(struct xfrm_sta
>   diff = x->replay.seq - seq;
>   x->replay.bitmap |= (1U << diff);
>   }
> +
> + if (sysctl_xfrm_aevent_on)
> + xfrm_replay_notify(x, XFRM_REPLAY_UPDATE);

This construct occurs many times.  How about putting it into an inline
function?

>  }
> -EXPORT_SYMBOL(xfrm_replay_advance);

Huh?

> @@ -835,7 +911,7 @@ void km_state_notify(struct xfrm_state *
>  EXPORT_SYMBOL(km_policy_notify);
>  EXPORT_SYMBOL(km_state_notify);
>  
> -static void km_state_expired(struct xfrm_state *x, int hard)
> +void km_state_expired(struct xfrm_state *x, int hard)
>  {
>   struct km_event c;

This hunk should go into the SA expire patch.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Patch 1/6] IPSEC: core updates

2006-01-28 Thread Patrick McHardy
Herbert Xu wrote:
> On Fri, Jan 27, 2006 at 08:05:23AM -0500, jamal wrote:
> 
>>+extern u32 sysctl_xfrm_aevent_on;
> 
> 
> I'd prefer for this to be automatically determined.  Indeed, this is
> a generic netlink problem.  We want to be easily determine at run time
> whether there are netlink sockets subscribed to a given multicast group.
> 
> Here is an idea, in addition to mc_list, we keep a bit mask of the groups
> that exist on mc_list.  This bit mask can then be used to quickly determine
> if a netlink skb needs to be generated or not.
> 
> This mask would be updated at bind(2) time.  Updating this is O(n) where
> n is the number of members on mc_list.  This should be fine since
> netlink_broadcast itself which occurs much more often is also O(n).

I already have a patch for this for ctnetlink. I'll try to update it
and post it sometime this weekend.
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Patch 1/6] IPSEC: core updates

2006-01-28 Thread jamal
On Sat, 2006-28-01 at 20:49 +1100, Herbert Xu wrote:
> On Fri, Jan 27, 2006 at 08:05:23AM -0500, jamal wrote:

> > +extern u32 sysctl_xfrm_aevent_on;
> 
> I'd prefer for this to be automatically determined.  Indeed, this is
> a generic netlink problem.  We want to be easily determine at run time
> whether there are netlink sockets subscribed to a given multicast group.
> 

You can reduce the problem to that, yes. And looking at Patrick's email
on ctnetlink, it is a similar problem.

The sysctl came after i saw the amount of events generated while
testing. Here are some numbers for you on one of my tests:
Initially i started with a default of replay threshold being 1/2 of each
SAs replay window. I was able to connect to a simulated 100 IKE peers on
a big xeon and send 1Kpps each direction. This resulted in an event
every second packet, every SA, every direction ;-> Thats (1K/2)*100*2
which is 100K events/second. I believe ctnetlink would be even worse
than this.

> Here is an idea, in addition to mc_list, we keep a bit mask of the groups
> that exist on mc_list.  This bit mask can then be used to quickly determine
> if a netlink skb needs to be generated or not.
> 
> This mask would be updated at bind(2) time.  Updating this is O(n) where
> n is the number of members on mc_list.  This should be fine since
> netlink_broadcast itself which occurs much more often is also O(n).
> 

Two things:

- The sysctl does a little more than this in that even if someone
joined, and the sysctl was not set then they get no events. I have not
used it in that perspective myself but for some reason in my notes i
have jotted down it as being useful. If i cant find a good reason then
it is not needed ;->

- if we auto-discover based on groups being subscribed-to, then the
lower we stop generating events the better; In this example at the point
where the replay is being updated in the fast path as opposed to all the
way when we want to do a sendmsg. This could be achieved if somehow the
group being subscribed to automagically sets sysctl_xfrm_aevent_on and
clears it when the last unsubscription happens - that would be
fantastic. I will wait to see what Patrick posts

> > +extern u32 sysctl_xfrm_aevent_etime;
> > +extern u32 sysctl_xfrm_aevent_rseqth;
> 
> Why do we need these defaults? I'd rather see these be removed and just have
> the user-space KM always set the values (if it needs aevent).
> 

This is mostly out of laziness when i started but has turned out to be
useful. For ease of testing,  i didnt want to set anything or make any
changes to a KM.
My initial attempt was to default to a time of 1 second and a replay
threshold of 1/2 the replay window. I was frustrated using racoon to
find it was by default setting the window to 4 ;-> (otoh, pluto sets it
to a very high number). I also couldnt justify why 1 sec or 10 sec or 30
seconds as i kept changing them.
For this reason i figured the admin would be the better person to make
that decision and i picked the defaults based on what i was
experimenting with.
I could remove them if this doesnt sound like a good reason.
 
> > diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
> > index e12d0be..2ccf87a 100644
> > --- a/net/xfrm/xfrm_state.c
> > +++ b/net/xfrm/xfrm_state.c
> >
> > @@ -62,6 +65,8 @@ static void xfrm_state_gc_destroy(struct
> >  {
> > if (del_timer(&x->timer))
> > BUG();
> > +   if (del_timer(&x->rtimer))
> > +   BUG();
> 
> How about just using x->timer?
> 

This part of the patch i left as is from Krisztian's original patch.
i have another timer i was going to add later (an idle timeout
to emulate a certain vendor when an SA is idle) and felt i could use
rtimer for that as well. For this reason i left it alone since it just
worked ;->

How expensive are timers these days? Lets handwave a moderately large
number of SAs (100K). With this we would have 200K timers as opposed to
100K if we aggregated them.

> > @@ -104,6 +109,20 @@ static inline unsigned long make_jiffies
> > return secs*HZ;
> >  }
> >  
> > +void xfrm_replay_notify(struct xfrm_state *, int);
> 
> Duplicate prototype.
> 

I am experimenting with stgit which gives you git and ability to stack
patches - I noticed actually it fscked me in 2 other patches or i could
be misusing it. I will fix this in the final version.
What do you guys use to stack patches?


[..]
> 
> This seems to be counter-intuitive.  Wouldn't it make more sense to
> schedule a timer in the XFRM_REPLAY_UPDATE case, and not schedule one
> in the XFRM_REPLAY_TIMEOUT case? Scheduling it in the XFRM_REPLAY_TIMEOUT
> case means that you may be waking up every maxage jiffies even when
> nothing at all is happening.  While doing it the other way means that
> you only schedule it when something has happened and we've suppressed
> the event due to maxdiff.
> 

This is part of Krisztian's original patch - he would be a better person
to respond to if we can wake him up. 

Waking up when nothing is hap

Re: [Patch 1/6] IPSEC: core updates

2006-01-29 Thread KOVACS Krisztian

  Hi,

On Saturday 28 January 2006 13:45, jamal wrote:
> > > +extern u32 sysctl_xfrm_aevent_etime;
> > > +extern u32 sysctl_xfrm_aevent_rseqth;
> >
> > Why do we need these defaults? I'd rather see these be removed and
> > just have the user-space KM always set the values (if it needs
> > aevent).
>
> This is mostly out of laziness when i started but has turned out to be
> useful. For ease of testing,  i didnt want to set anything or make any
> changes to a KM.
> My initial attempt was to default to a time of 1 second and a replay
> threshold of 1/2 the replay window. I was frustrated using racoon to
> find it was by default setting the window to 4 ;-> (otoh, pluto sets it
> to a very high number). I also couldnt justify why 1 sec or 10 sec or
> 30 seconds as i kept changing them.
> For this reason i figured the admin would be the better person to make
> that decision and i picked the defaults based on what i was
> experimenting with.
> I could remove them if this doesnt sound like a good reason.

  I don't really like the idea of generating events unless explicitly 
requested by the KM. Once a PF_KEY interface is in place such behaviour 
mightl break compatibility with racoon's libipsec which considers unknown 
extension headers as errors.

  Whether or not a PF_KEY interface for these events is desired is also a 
good question. Initially I had a patch for PF_KEY, but only because we 
used racoon and it was easier to implement a pfkey extension than adding 
netlink support to racoon. Such PF_KEY extensions would just add even 
more types to the growing list of non-standard PF_KEY extension headers.

  To put it simple: I don't think PF_KEY is worth the hassle unless 
someone comes up with an open source software utilizing that interface.

> > > diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
> > > index e12d0be..2ccf87a 100644
> > > --- a/net/xfrm/xfrm_state.c
> > > +++ b/net/xfrm/xfrm_state.c
> > >
> > > @@ -62,6 +65,8 @@ static void xfrm_state_gc_destroy(struct
> > >  {
> > >   if (del_timer(&x->timer))
> > >   BUG();
> > > + if (del_timer(&x->rtimer))
> > > + BUG();
> >
> > How about just using x->timer?
>
> This part of the patch i left as is from Krisztian's original patch.
> i have another timer i was going to add later (an idle timeout
> to emulate a certain vendor when an SA is idle) and felt i could use
> rtimer for that as well. For this reason i left it alone since it just
> worked ;->
>
> How expensive are timers these days? Lets handwave a moderately large
> number of SAs (100K). With this we would have 200K timers as opposed to
> 100K if we aggregated them.

  Of course we could aggregate these timers, this is just a leftover from 
the very first patch. However, this would mean that a more fine-grained 
timing would be necessary for that timer instead of the current  
precision of one second. Which is of course not a problem, just makes the 
patch a bit more intrusive.

> > This seems to be counter-intuitive.  Wouldn't it make more sense to
> > schedule a timer in the XFRM_REPLAY_UPDATE case, and not schedule one
> > in the XFRM_REPLAY_TIMEOUT case? Scheduling it in the
> > XFRM_REPLAY_TIMEOUT case means that you may be waking up every maxage
> > jiffies even when nothing at all is happening.  While doing it the
> > other way means that you only schedule it when something has happened
> > and we've suppressed the event due to maxdiff.
>
> This is part of Krisztian's original patch - he would be a better
> person to respond to if we can wake him up.
>
> Waking up when nothing is happening is OK, but waking up and generating
> events when nothing is happening is not. so the rescheduling is fine as
> far as i can see but what you describe is an optimization i.e
> knowing nothing has happened and never waking up is even better.
> Krisztian?

  Sure, pretty good point.

  One more thing: whether or not this timer is necessary really depends on 
how the sync daemon uses these events. The most simple trick of simply 
adding a large-enough constant to the sequence numbers of outgoing 
packets after failover does not take advantage of this feature. Of course 
if someone comes up with a more exact way of handling failover cases, 
then those additional update messages could be useful.

-- 
 Regards,
  Krisztian Kovacs
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Patch 1/6] IPSEC: core updates

2006-01-30 Thread jamal
Olla Krisztian,

Thanks for taking the time.

On Sun, 2006-29-01 at 22:54 +0100, KOVACS Krisztian wrote:
>   Hi,
> 
> On Saturday 28 January 2006 13:45, jamal wrote:
[..]
>   I don't really like the idea of generating events unless explicitly 
> requested by the KM. Once a PF_KEY interface is in place such behaviour 
> mightl break compatibility with racoon's libipsec which considers unknown 
> extension headers as errors.
> 
>
>   Whether or not a PF_KEY interface for these events is desired is also a 
> good question. Initially I had a patch for PF_KEY, but only because we 
> used racoon and it was easier to implement a pfkey extension than adding 
> netlink support to racoon. Such PF_KEY extensions would just add even 
> more types to the growing list of non-standard PF_KEY extension headers.
> 
>   To put it simple: I don't think PF_KEY is worth the hassle unless 
> someone comes up with an open source software utilizing that interface.
> 

I agree. And if you look at something like sasyncd, it is obvious you
dont need it if what you want to achieve is failover. Infact you dont
need racoon in some cases at all (example this could be done for manual
setup as well).

In the case you do use racoon, the question is ISAKMP SAs; how do you
fail those over? i.e how do you keep racoon-master and racoon-slave
synced - one idea could be to make sure the cookies are synced. I know
in your work you were doing something with racoon - was there anything
of this sort that you did?

[..]
> >
> > How expensive are timers these days? Lets handwave a moderately large
> > number of SAs (100K). With this we would have 200K timers as opposed to
> > 100K if we aggregated them.
> 
>   Of course we could aggregate these timers, this is just a leftover from 
> the very first patch. However, this would mean that a more fine-grained 
> timing would be necessary for that timer instead of the current  
> precision of one second. 

I thought about this after i typed my last email. The more fine grained
the timer, the better the failover time. The SA expiry timers dont need
anything more than 1 sec granularity (infact anything lower than 60
seconds will be considered strange)

> Which is of course not a problem, just makes the patch a bit more intrusive.
> 

true although i am still curious if it may be worth the complexity of
having a single timer - the only reason i can see for aggregation is for
perfomance reasons.

> > Waking up when nothing is happening is OK, but waking up and generating
> > events when nothing is happening is not. so the rescheduling is fine as
> > far as i can see but what you describe is an optimization i.e
> > knowing nothing has happened and never waking up is even better.
> > Krisztian?
> 
>   Sure, pretty good point.
> 

Ok, I will fix this.

>   One more thing: whether or not this timer is necessary really depends on 
> how the sync daemon uses these events. The most simple trick of simply 
> adding a large-enough constant to the sequence numbers of outgoing 
> packets after failover does not take advantage of this feature. Of course 
> if someone comes up with a more exact way of handling failover cases, 
> then those additional update messages could be useful.

The timer allows for more exact values. IOW, if you use the timer and
you dont loose any events you can _guarantee_ exact numbers; with
padding the replay it becomes:
a) if you make it small more of how lucky you are in choosing a good
padding or 
b) dropping a substantial amount of packets if you use brute force and
make it large. 

The trick of padding the replay values on the backup node could still be
used in addition to the timer but only a small padding is needed.
In any case, this mechanism is still needed/valuable.

cheers,
jamal

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Patch 1/6] IPSEC: core updates

2006-01-30 Thread KOVACS Krisztian

  Hi,

On Monday 30 January 2006 14.14, jamal wrote:
[...]
> >   To put it simple: I don't think PF_KEY is worth the hassle unless
> > someone comes up with an open source software utilizing that interface.
>
> I agree. And if you look at something like sasyncd, it is obvious you
> dont need it if what you want to achieve is failover. Infact you dont
> need racoon in some cases at all (example this could be done for manual
> setup as well).
>
> In the case you do use racoon, the question is ISAKMP SAs; how do you
> fail those over? i.e how do you keep racoon-master and racoon-slave
> synced - one idea could be to make sure the cookies are synced. I know
> in your work you were doing something with racoon - was there anything
> of this sort that you did?

  We implemented partial ISAKMP SA synchronization in racoon. That way the 
cookies, the shared secrets, etc. were synchronized to the slaves, so that 
after failing over the new master could even do a phase2 exchange using the 
previously negotiated shared secret. In this case a PF_KEY interface is a 
good thing because that way you can have racoon

a) set up the desired event parameters for the SAs (no need to use the 
sysctl-controlled global default setting)

b) do the IPSEC SA synchronization as well, utilizing the already existing 
framework for master-slave communication (which is necessary for ISAKMP SA 
synchronization anyway).

> > > How expensive are timers these days? Lets handwave a moderately large
> > > number of SAs (100K). With this we would have 200K timers as opposed
> > > to 100K if we aggregated them.
> >
> >   Of course we could aggregate these timers, this is just a leftover
> > from the very first patch. However, this would mean that a more
> > fine-grained timing would be necessary for that timer instead of the
> > current precision of one second.
>
> I thought about this after i typed my last email. The more fine grained
> the timer, the better the failover time. The SA expiry timers dont need
> anything more than 1 sec granularity (infact anything lower than 60
> seconds will be considered strange)
>
> > Which is of course not a problem, just makes the patch a bit more
> > intrusive.
>
> true although i am still curious if it may be worth the complexity of
> having a single timer - the only reason i can see for aggregation is for
> perfomance reasons.

  Exactly, that's what I was trying to say. Although if you have a lot of 
SAs the memory needed by an additional timer_list may be significant (about 
48 bytes per SA on 64 bit architectures if my memory serves me well).

  BTW, did anyone do any testing with such a high amount of SAs in place? I 
would be seriously interested in the results, especially if some user-space 
KM is involved as well. I only have some limited (lab environment) 
experience with racoon, which is unable to handle more than a couple 
hundred SAs when running on Linux.

> >   One more thing: whether or not this timer is necessary really depends
> > on how the sync daemon uses these events. The most simple trick of
> > simply adding a large-enough constant to the sequence numbers of
> > outgoing packets after failover does not take advantage of this
> > feature. Of course if someone comes up with a more exact way of
> > handling failover cases, then those additional update messages could be
> > useful.
>
> The timer allows for more exact values. IOW, if you use the timer and
> you dont loose any events you can _guarantee_ exact numbers; with
> padding the replay it becomes:
> a) if you make it small more of how lucky you are in choosing a good
> padding or
> b) dropping a substantial amount of packets if you use brute force and
> make it large.
>
> The trick of padding the replay values on the backup node could still be
> used in addition to the timer but only a small padding is needed.
> In any case, this mechanism is still needed/valuable.

  Indeed, but this value depends on whether or not the user-space is clever 
enough to use it. Let's suppose it will be. :)

-- 
 Regards,
  Krisztian Kovacs
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Patch 1/6] IPSEC: core updates

2006-01-30 Thread jamal
Olla,

On Mon, 2006-30-01 at 15:33 +0100, KOVACS Krisztian wrote:
> On Monday 30 January 2006 14.14, jamal wrote:
[..]
>   We implemented partial ISAKMP SA synchronization in racoon. That way the 
> cookies, the shared secrets, etc. were synchronized to the slaves, so that 
> after failing over the new master could even do a phase2 exchange using the 
> previously negotiated shared secret. In this case a PF_KEY interface is a 
> good thing because that way you can have racoon
> 
> a) set up the desired event parameters for the SAs (no need to use the 
> sysctl-controlled global default setting)
> 
> b) do the IPSEC SA synchronization as well, utilizing the already existing 
> framework for master-slave communication (which is necessary for ISAKMP SA 
> synchronization anyway).
> 

Unfortunately this would also mean dependency on racoon. Is there any
other way to do it without having to change racoon? example the phase1
scripts or racoonctl?
It seems to me that the only useful runtime parameter really - dont know
how you would extract this without changing racoon - is peer/local
cookies, no? From that one should be able to generate the SAs.

> > true although i am still curious if it may be worth the complexity of
> > having a single timer - the only reason i can see for aggregation is for
> > perfomance reasons.
> 
>   Exactly, that's what I was trying to say. Although if you have a lot of 
> SAs the memory needed by an additional timer_list may be significant (about 
> 48 bytes per SA on 64 bit architectures if my memory serves me well).
> 

So about 40M for every 100K rules - Dont care really these days for that
kind of RAM as long as performance is not impacted.

>   BTW, did anyone do any testing with such a high amount of SAs in place? I 
> would be seriously interested in the results, especially if some user-space 
> KM is involved as well. I only have some limited (lab environment) 
> experience with racoon, which is unable to handle more than a couple 
> hundred SAs when running on Linux.

I am planning to go to at least 100K SAs ;-> I am also quiet curious.
So far i have stopped at about 100.

> > The trick of padding the replay values on the backup node could still be
> > used in addition to the timer but only a small padding is needed.
> > In any case, this mechanism is still needed/valuable.
> 
>   Indeed, but this value depends on whether or not the user-space is clever 
> enough to use it. Let's suppose it will be. :)

I do in the code i am testing with at the moment. I havent been testing
a lot of corner cases - and so far havent needed any padding; but will
let you know how it goes. In any case, lets agree we need it. Whoever
feels brave enough to do without could.

cheers,
jamal

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Patch 1/6] IPSEC: core updates

2006-01-31 Thread KOVACS Krisztian

  Hi,

On Monday 30 January 2006 22:33, jamal wrote:
> >   We implemented partial ISAKMP SA synchronization in racoon. That
> Unfortunately this would also mean dependency on racoon. Is there any
> other way to do it without having to change racoon? example the phase1
> scripts or racoonctl?
> It seems to me that the only useful runtime parameter really - dont
> know how you would extract this without changing racoon - is peer/local
> cookies, no? From that one should be able to generate the SAs.

  Not really IMHO. You definitely need the shared secret associated with 
that SA, DPD state, etc.

  But what about leaving this alone for now, I think the very first step 
should be something like OpenBSD's sasyncd, which absolutely does not 
care about proper ISAKMP synchronization. We can think about these things 
later, if all the kernel-level things are settled.

> >   Indeed, but this value depends on whether or not the user-space is
> > clever enough to use it. Let's suppose it will be. :)
>
> I do in the code i am testing with at the moment. I havent been testing
> a lot of corner cases - and so far havent needed any padding; but will
> let you know how it goes. In any case, lets agree we need it. Whoever
> feels brave enough to do without could.

  OK.

-- 
 Regards,
  Krisztian Kovacs
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Patch 1/6] IPSEC: core updates

2006-02-01 Thread jamal
Hi Krisztian,

On Tue, 2006-31-01 at 22:37 +0100, KOVACS Krisztian wrote:
>   Hi,

>   But what about leaving this alone for now, I think the very first step 
> should be something like OpenBSD's sasyncd, which absolutely does not 
> care about proper ISAKMP synchronization. We can think about these things 
> later, if all the kernel-level things are settled.
> 

Sounds good.
I am going to repost the patches later on after some testing taking
into consideration Patrick's netlink patch as well as some of Herberts
changes.

cheers,
jamal


-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Patch 1/6] IPSEC: core updates

2006-02-19 Thread YOSHIFUJI Hideaki / 吉藤英明
In article <[EMAIL PROTECTED]> (at Fri, 27 Jan 2006 08:05:23 -0500), jamal 
<[EMAIL PROTECTED]> says:

> diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
> index 82fbb75..b54a129 100644
> --- a/include/linux/xfrm.h
> +++ b/include/linux/xfrm.h
:
> @@ -235,6 +258,11 @@ struct xfrm_usersa_id {
>   __u8proto;
>  };
>  
> +struct xfrm_aevent_id {
> + __u32   flags;
> + struct xfrm_usersa_id   sa_id;
> +};
> +
>  struct xfrm_userspi_info {
>   struct xfrm_usersa_info info;
>   __u32   min;
:

Maybe paranoid, but anyway...

Jamal, is this okay on 64/32 biarch, which means, there are no differenes
between 32bit arch and 64bit arch?
IMHO, because of alignment, it is better to swap flags and sa_id,
isn't it?

--yoshfuji
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Patch 1/6] IPSEC: core updates

2006-02-19 Thread David S. Miller
From: YOSHIFUJI Hideaki <[EMAIL PROTECTED]>
Date: Mon, 20 Feb 2006 14:29:47 +0900 (JST)

> > +struct xfrm_aevent_id {
> > +   __u32   flags;
> > +   struct xfrm_usersa_id   sa_id;
> > +};
> > +
> >  struct xfrm_userspi_info {
> > struct xfrm_usersa_info info;
> > __u32   min;
> :
> 
> Maybe paranoid, but anyway...
> 
> Jamal, is this okay on 64/32 biarch, which means, there are no differenes
> between 32bit arch and 64bit arch?
> IMHO, because of alignment, it is better to swap flags and sa_id,
> isn't it?

I agree, this needs serious consideration.
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Patch 1/6] IPSEC: core updates

2006-02-20 Thread David S. Miller
From: jamal <[EMAIL PROTECTED]>
Date: Mon, 20 Feb 2006 08:10:44 -0500

> Explain the rules to me: is it because the alignment in xfrm_usersa_id
> may change in the future?

Alignment on x86 of u64 is different from x86_64 and ia64,
so we must be extremely careful else we will have to
translate these structures going in and out of compat
processes, which is beyond painful.

So we should ensure the structure layout and alignment
is identical on all systems, 32-bit or 64-bit, in order
to avoid these problems.
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Patch 1/6] IPSEC: core updates

2006-02-21 Thread jamal
On Mon, 2006-20-02 at 15:05 -0800, David S. Miller wrote:
> From: jamal <[EMAIL PROTECTED]>
> Date: Mon, 20 Feb 2006 08:10:44 -0500
> 
> > Explain the rules to me: is it because the alignment in xfrm_usersa_id
> > may change in the future?
> 
> Alignment on x86 of u64 is different from x86_64 and ia64,
> so we must be extremely careful else we will have to
> translate these structures going in and out of compat
> processes, which is beyond painful.
> 
> So we should ensure the structure layout and alignment
> is identical on all systems, 32-bit or 64-bit, in order
> to avoid these problems.

Ok. Patch attached against net-2617

Yoshfuji-san you should probably write a little doc that should be
available in the Doc/ directory.

cheers,
jamal

--

struct xfrm_aevent_id needs to be 32-bit + 64-bit align friendly.

Signed-off-by: Jamal Hadi Salim <[EMAIL PROTECTED]>
diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
index b54a129..6b42cc4 100644
--- a/include/linux/xfrm.h
+++ b/include/linux/xfrm.h
@@ -259,8 +259,8 @@ struct xfrm_usersa_id {
 };
 
 struct xfrm_aevent_id {
-	__u32flags;
 	struct xfrm_usersa_id		sa_id;
+	__u32flags;
 };
 
 struct xfrm_userspi_info {


Re: [Patch 1/6] IPSEC: core updates

2006-02-23 Thread David S. Miller
From: jamal <[EMAIL PROTECTED]>
Date: Tue, 21 Feb 2006 08:31:49 -0500

> Ok. Patch attached against net-2617
> 
> Yoshfuji-san you should probably write a little doc that should be
> available in the Doc/ directory.

If we write this, please ask Andi Kleen to review it.
His arch has the most problems in this area making him
an expert on this topic :-)

> struct xfrm_aevent_id needs to be 32-bit + 64-bit align friendly.
> 
> Signed-off-by: Jamal Hadi Salim <[EMAIL PROTECTED]>

Applied, thanks everyone.
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html