[PATCHv3 1/3] NET_SCHED: PSPacer qdisc module

2007-11-28 Thread Ryousei Takano
This patch includes the PSPacer (Precise Software Pacer) qdisc
module, which achieves precise transmission bandwidth control.
You can find more information at the project web page
(http://www.gridmpi.org/gridtcp.jsp).

Signed-off-by: Ryousei Takano <[EMAIL PROTECTED]>
---
 include/linux/pkt_sched.h |   29 ++
 net/sched/Kconfig |9 +
 net/sched/Makefile|1 +
 net/sched/sch_psp.c   |  962 +
 4 files changed, 1001 insertions(+), 0 deletions(-)
 create mode 100644 net/sched/sch_psp.c

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 919af93..d2c5da1 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -430,6 +430,35 @@ enum {
 
 #define TCA_ATM_MAX(__TCA_ATM_MAX - 1)
 
+/* Precise Software Pacer section */
+
+#define TC_PSP_MAXDEPTH (8) /* NOTE(review): presumably max class tree depth - confirm */
+
+enum { /* values of the class "mode" option (target rate estimation method) */
+   MODE_NORMAL = 0, /* mode 0: without pacing (per the tc-psp man page) */
+   MODE_STATIC = 1, /* mode 1: static target rate (per the tc-psp man page) */
+};
+
+struct tc_psp_copt /* class options */
+{
+   __u32   level; /* NOTE(review): presumably depth in the class tree - confirm */
+   __u32   mode; /* MODE_NORMAL or MODE_STATIC */
+   __u32   rate;   /* bytes/sec */
+};
+
+struct tc_psp_qopt /* qdisc options, filled in by tc's psp_parse_opt() */
+{
+   __u32   defcls; /* minor id of the class for unclassified packets */
+   __u32   rate;   /* bytes/sec */
+};
+
+enum /* attribute types nested inside TCA_OPTIONS */
+{
+   TCA_PSP_UNSPEC,
+   TCA_PSP_COPT, /* payload: struct tc_psp_copt */
+   TCA_PSP_QOPT, /* presumably carries struct tc_psp_qopt - confirm */
+};
+
 /* Network emulator */
 
 enum
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 9c15c48..ec40e43 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -184,6 +184,15 @@ config NET_SCH_DSMARK
  To compile this code as a module, choose M here: the
  module will be called sch_dsmark.
 
+config NET_SCH_PSP
+   tristate "Precise Software Pacer (PSP)"
+   ---help---
+ Say Y here if you want to include the PSPacer module, which
+ provides precise transmission bandwidth control (pacing).
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_psp.
+
 config NET_SCH_NETEM
tristate "Network emulator (NETEM)"
---help---
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 81ecbe8..85425c2 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o
 obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o
 obj-$(CONFIG_NET_SCH_PRIO) += sch_prio.o
 obj-$(CONFIG_NET_SCH_ATM)  += sch_atm.o
+obj-$(CONFIG_NET_SCH_PSP)  += sch_psp.o
 obj-$(CONFIG_NET_SCH_NETEM)+= sch_netem.o
 obj-$(CONFIG_NET_CLS_U32)  += cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)   += cls_route.o
diff --git a/net/sched/sch_psp.c b/net/sched/sch_psp.c
new file mode 100644
index 000..620a224
--- /dev/null
+++ b/net/sched/sch_psp.c
@@ -0,0 +1,962 @@
+/*
+ * net/sched/sch_psp.c PSPacer: Precise Software Pacer
+ *
+ * Copyright (C) 2004-2007 National Institute of Advanced
+ * Industrial Science and Technology (AIST), Japan.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors:Ryousei Takano, <[EMAIL PROTECTED]>
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/*
+ * PSPacer achieves precise rate regulation results, and no microscopic
+ * burst transmission which exceeds the limit is generated.
+ *
+ * The basic idea is that transmission timing can be precisely controlled,
+ * if packets are sent back-to-back at the wire rate.  PSPacer controls
+ * the packet transmission intervals by inserting additional packets,
+ * called gap packets, between adjacent packets.  The transmission interval
+ * can be controlled accurately by adjusting the number and size of the gap
+ * packets. PSPacer uses the 802.3x PAUSE frame as the gap packet.
+ *
+ * For the purpose of adjusting the gap size, this Qdisc maintains a byte
+ * clock which records the total number of bytes transmitted per connection.
+ * Each sub-class has a class local clock which is used to decide
+ * whether to send a packet or not.  If there are no packets to send,
+ * gap packets are inserted.
+ *
+ * References:
+ * [1] R.Takano, T.Kudoh, Y.Kodama, M.Matsuda, H.Tezuka, and Y.Ishikawa,
+ * "Design and Evaluation of Precise Software Pacing Mechanisms for
+ * Fast Long-Distance Networks", PFLDnet2005.
+ * [2] http://www.gridmpi.org/gridtcp.jsp
+ */
+
+#define HW_GAP (16)	/* Preamble(8) + Inter Frame Gap(8) */
+#define FCS (4)	/* Frame Check Sequence(4) */
+#define MIN_GAP (64)	/* Minimum size of gap packet */
+#define MIN_TARGET_RATE (1000)	/* 1 KBytes/sec */
+
+#define PSP_HSIZE (16)	/* NOTE(review): presumably class hash table size - confirm */
+
+struct psp_class
+{
+   u32 classid;/* class id */
+   int refcnt; /* reference count */
+
+   struct gnet_stats_basi

[PATCHv3 1/3] NET_SCHED: PSPacer qdisc module

2007-11-28 Thread Ryousei Takano
This patch includes the PSPacer (Precise Software Pacer) qdisc
module, which achieves precise transmission bandwidth control.
You can find more information at the project web page
(http://www.gridmpi.org/gridtcp.jsp).

Signed-off-by: Ryousei Takano <[EMAIL PROTECTED]>
---
 include/linux/pkt_sched.h |   29 ++
 net/sched/Kconfig |9 +
 net/sched/Makefile|1 +
 net/sched/sch_psp.c   |  962 +
 4 files changed, 1001 insertions(+), 0 deletions(-)
 create mode 100644 net/sched/sch_psp.c

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 919af93..d2c5da1 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -430,6 +430,35 @@ enum {
 
 #define TCA_ATM_MAX(__TCA_ATM_MAX - 1)
 
+/* Precise Software Pacer section */
+
+#define TC_PSP_MAXDEPTH (8) /* NOTE(review): presumably max class tree depth - confirm */
+
+enum { /* values of the class "mode" option (target rate estimation method) */
+   MODE_NORMAL = 0, /* mode 0: without pacing (per the tc-psp man page) */
+   MODE_STATIC = 1, /* mode 1: static target rate (per the tc-psp man page) */
+};
+
+struct tc_psp_copt /* class options */
+{
+   __u32   level; /* NOTE(review): presumably depth in the class tree - confirm */
+   __u32   mode; /* MODE_NORMAL or MODE_STATIC */
+   __u32   rate;   /* bytes/sec */
+};
+
+struct tc_psp_qopt /* qdisc options, filled in by tc's psp_parse_opt() */
+{
+   __u32   defcls; /* minor id of the class for unclassified packets */
+   __u32   rate;   /* bytes/sec */
+};
+
+enum /* attribute types nested inside TCA_OPTIONS */
+{
+   TCA_PSP_UNSPEC,
+   TCA_PSP_COPT, /* payload: struct tc_psp_copt */
+   TCA_PSP_QOPT, /* presumably carries struct tc_psp_qopt - confirm */
+};
+
 /* Network emulator */
 
 enum
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 9c15c48..ec40e43 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -184,6 +184,15 @@ config NET_SCH_DSMARK
  To compile this code as a module, choose M here: the
  module will be called sch_dsmark.
 
+config NET_SCH_PSP
+   tristate "Precise Software Pacer (PSP)"
+   ---help---
+ Say Y here if you want to include the PSPacer module, which
+ provides precise transmission bandwidth control (pacing).
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_psp.
+
 config NET_SCH_NETEM
tristate "Network emulator (NETEM)"
---help---
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 81ecbe8..85425c2 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o
 obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o
 obj-$(CONFIG_NET_SCH_PRIO) += sch_prio.o
 obj-$(CONFIG_NET_SCH_ATM)  += sch_atm.o
+obj-$(CONFIG_NET_SCH_PSP)  += sch_psp.o
 obj-$(CONFIG_NET_SCH_NETEM)+= sch_netem.o
 obj-$(CONFIG_NET_CLS_U32)  += cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)   += cls_route.o
diff --git a/net/sched/sch_psp.c b/net/sched/sch_psp.c
new file mode 100644
index 000..620a224
--- /dev/null
+++ b/net/sched/sch_psp.c
@@ -0,0 +1,962 @@
+/*
+ * net/sched/sch_psp.c PSPacer: Precise Software Pacer
+ *
+ * Copyright (C) 2004-2007 National Institute of Advanced
+ * Industrial Science and Technology (AIST), Japan.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors:Ryousei Takano, <[EMAIL PROTECTED]>
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/*
+ * PSPacer achieves precise rate regulation results, and no microscopic
+ * burst transmission which exceeds the limit is generated.
+ *
+ * The basic idea is that transmission timing can be precisely controlled,
+ * if packets are sent back-to-back at the wire rate.  PSPacer controls
+ * the packet transmission intervals by inserting additional packets,
+ * called gap packets, between adjacent packets.  The transmission interval
+ * can be controlled accurately by adjusting the number and size of the gap
+ * packets. PSPacer uses the 802.3x PAUSE frame as the gap packet.
+ *
+ * For the purpose of adjusting the gap size, this Qdisc maintains a byte
+ * clock which records the total number of bytes transmitted per connection.
+ * Each sub-class has a class local clock which is used to decide
+ * whether to send a packet or not.  If there are no packets to send,
+ * gap packets are inserted.
+ *
+ * References:
+ * [1] R.Takano, T.Kudoh, Y.Kodama, M.Matsuda, H.Tezuka, and Y.Ishikawa,
+ * "Design and Evaluation of Precise Software Pacing Mechanisms for
+ * Fast Long-Distance Networks", PFLDnet2005.
+ * [2] http://www.gridmpi.org/gridtcp.jsp
+ */
+
+#define HW_GAP (16)	/* Preamble(8) + Inter Frame Gap(8) */
+#define FCS (4)	/* Frame Check Sequence(4) */
+#define MIN_GAP (64)	/* Minimum size of gap packet */
+#define MIN_TARGET_RATE (1000)	/* 1 KBytes/sec */
+
+#define PSP_HSIZE (16)	/* NOTE(review): presumably class hash table size - confirm */
+
+struct psp_class
+{
+   u32 classid;/* class id */
+   int refcnt; /* reference count */
+
+   struct gnet_stats_basi

[PATCHv3 2/3] TC: PSPacer qdisc module

2007-11-28 Thread Ryousei Takano
This patch includes the PSPacer (Precise Software Pacer) qdisc
tc part, which achieves precise transmission bandwidth control.
You can find more information at the project web page
(http://www.gridmpi.org/gridtcp.jsp).

Signed-off-by: Ryousei Takano <[EMAIL PROTECTED]>
---
 include/linux/pkt_sched.h |   29 +++
 tc/Makefile   |1 +
 tc/q_psp.c|  199 +
 3 files changed, 229 insertions(+), 0 deletions(-)
 create mode 100644 tc/q_psp.c

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 268c515..be7b466 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -430,6 +430,35 @@ enum {
 
 #define TCA_ATM_MAX(__TCA_ATM_MAX - 1)
 
+/* Precise Software Pacer section */
+
+#define TC_PSP_MAXDEPTH (8) /* NOTE(review): presumably max class tree depth - confirm */
+
+enum { /* values of the class "mode" option (target rate estimation method) */
+   MODE_NORMAL = 0, /* mode 0: without pacing (per the tc-psp man page) */
+   MODE_STATIC = 1, /* mode 1: static target rate (per the tc-psp man page) */
+};
+
+struct tc_psp_copt /* class options */
+{
+   __u32   level; /* NOTE(review): presumably depth in the class tree - confirm */
+   __u32   mode; /* MODE_NORMAL or MODE_STATIC */
+   __u32   rate;   /* bytes/sec */
+};
+
+struct tc_psp_qopt /* qdisc options, filled in by psp_parse_opt() */
+{
+   __u32   defcls; /* minor id of the class for unclassified packets */
+   __u32   rate;   /* bytes/sec */
+};
+
+enum /* attribute types nested inside TCA_OPTIONS */
+{
+   TCA_PSP_UNSPEC,
+   TCA_PSP_COPT, /* payload: struct tc_psp_copt */
+   TCA_PSP_QOPT, /* presumably carries struct tc_psp_qopt - confirm */
+};
+
 /* Network emulator */
 
 enum
diff --git a/tc/Makefile b/tc/Makefile
index a715566..836df9d 100644
--- a/tc/Makefile
+++ b/tc/Makefile
@@ -12,6 +12,7 @@ TCMODULES += q_prio.o
 TCMODULES += q_tbf.o
 TCMODULES += q_cbq.o
 TCMODULES += q_rr.o
+TCMODULES += q_psp.o
 TCMODULES += q_netem.o
 TCMODULES += f_rsvp.o
 TCMODULES += f_u32.o
diff --git a/tc/q_psp.c b/tc/q_psp.c
new file mode 100644
index 000..db5d42d
--- /dev/null
+++ b/tc/q_psp.c
@@ -0,0 +1,199 @@
+/*
+ * q_psp.c PSPacer: Precise Software Pacer
+ *
+ * Copyright (C) 2004-2007 National Institute of Advanced
+ * Industrial Science and Technology (AIST), Japan.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors:Ryousei Takano, <[EMAIL PROTECTED]>
+ *
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "utils.h"
+#include "tc_util.h"
+
+/*
+ * Print the usage summary for the psp qdisc and class options to stderr.
+ * Values shown in braces are the defaults.
+ */
+static void explain(void)
+{
+	fprintf(stderr,
+"Usage: ... qdisc add ... psp [ default N ] [rate RATE]\n"
+" default  minor id of class to which unclassified packets are sent {0}\n"
+" rate physical interface bandwidth\n\n"
+"... class add ... psp [ mode M ] [ rate RATE ]\n"
+" mode target rate estimation method (NORMAL=0 STATIC=1) {0}\n"
+" rate rate allocated to this class\n");
+}
+
+/*
+ * Report that the value given for option @arg is illegal, then print the
+ * usage summary.  @arg is only read, so string literals may be passed.
+ */
+static void explain1(const char *arg)
+{
+	fprintf(stderr, "Illegal \"%s\"\n", arg);
+	explain();
+}
+
+
+/*
+ * Parse the psp qdisc command-line options into a struct tc_psp_qopt and
+ * append it to the netlink request @n.
+ *
+ * Recognised keywords:
+ *   rate RATE   parsed with get_rate() into qopt.rate
+ *   default N   parsed with get_u32() in base 16 into qopt.defcls
+ *   help        print the usage summary and fail
+ *
+ * The options travel as a TCA_PSP_QOPT attribute nested inside TCA_OPTIONS.
+ * Nested attributes are numbered with the TCA_PSP_* enum (psp_print_opt()
+ * sizes and indexes its table with it), so the payload must carry that
+ * type rather than the outer container's.
+ *
+ * Returns 0 on success, -1 on an unknown keyword, an invalid value or
+ * after printing help.
+ */
+static int psp_parse_opt(struct qdisc_util *qu, int argc, char **argv,
+			 struct nlmsghdr *n)
+{
+	struct tc_psp_qopt qopt;
+	struct rtattr *tail;
+
+	memset(&qopt, 0, sizeof(qopt));
+
+	while (argc > 0) {
+		if (matches(*argv, "rate") == 0) {
+			NEXT_ARG();
+			if (get_rate(&qopt.rate, *argv)) {
+				explain1("rate");
+				return -1;
+			}
+		} else if (matches(*argv, "default") == 0) {
+			NEXT_ARG();
+			if (get_u32(&qopt.defcls, *argv, 16)) {
+				explain1("default");
+				return -1;
+			}
+		} else if (matches(*argv, "help") == 0) {
+			explain();
+			return -1;
+		} else {
+			fprintf(stderr, "What is \"%s\"?\n", *argv);
+			explain();
+			return -1;
+		}
+		argc--;
+		argv++;
+	}
+
+	/* Open the TCA_OPTIONS container; its length is patched up below. */
+	tail = NLMSG_TAIL(n);
+	addattr_l(n, 1024, TCA_OPTIONS, NULL, 0);
+	addattr_l(n, 2024, TCA_PSP_QOPT, &qopt, NLMSG_ALIGN(sizeof(qopt)));
+	tail->rta_len = (void *) NLMSG_TAIL(n) - (void *) tail;
+	return 0;
+}
+
+static int psp_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt)
+{
+   struct rtattr *tb[TCA_PSP_QOPT+1];
+   struct tc_psp_copt *copt;
+   struct tc_psp_qopt *qopt;
+   SPRINT_BUF(b);
+
+   if (opt == NULL)
+   return 0;
+
+   memset(tb, 0, sizeof(tb));
+   parse_rtattr_nested(tb, TCA_PSP_QOPT, opt);
+
+   if (tb[TCA_PSP_COPT]) {
+   copt = RTA_DATA(tb[TCA_PSP_COPT]);
+   if (RTA_PAY

[PATCHv3 3/3] TC: PSPacer man page

2007-11-28 Thread Ryousei Takano
This patch includes the man page of the PSPacer (Precise Software
Pacer) qdisc module.

Signed-off-by: Ryousei Takano <[EMAIL PROTECTED]>
---
 man/man8/tc-psp.8 |  166 +
 1 files changed, 166 insertions(+), 0 deletions(-)
 create mode 100644 man/man8/tc-psp.8

diff --git a/man/man8/tc-psp.8 b/man/man8/tc-psp.8
new file mode 100644
index 000..a6e26bf
--- /dev/null
+++ b/man/man8/tc-psp.8
@@ -0,0 +1,166 @@
+.TH PSP 8 "13 October 2007" "iproute2" "Linux"
+.SH NAME
+PSP \- Precise Software Pacer
+.SH SYNOPSIS
+.B tc qdisc ... dev
+dev
+.B  ( parent
+classid 
+.B | root) [ handle 
+major: 
+.B ] psp [ default 
+minor-id
+.B ] [ rate
+rate
+.B ] 
+
+.B tc class ... dev
+dev
+.B parent 
+major:[minor]
+.B [ classid 
+major:minor
+.B ] psp [ rate
+rate
+.B ] [ mode 
+mode
+.B ] 
+
+.SH DESCRIPTION
+Precise Software Pacer (PSPacer) is a classful queuing discipline 
+which controls traffic with
+.BR tc (8)
+command.
+PSP achieves a precise pacing per class.
+
+.SH GAP PACKET
+The key to realizing precise pacing is to control the starting time of 
+the transmission of each packet.  We propose a simple yet accurate 
+mechanism to trigger the transmission of a packet.  That is, to insert 
+a gap packet between the real packets.  The gap packet produces a gap 
+between sequentially transmitted real packets.
+We employ a PAUSE packet as a gap packet.  A PAUSE packet is defined in 
+the IEEE 802.3x flow control.
+
+By changing the gap packet size, the starting time of 
+the next real packet transmission can be precisely controlled.
+For example, to control a half rate transmission, a gap packet is inserted 
+between every real packet where the gap packet size is the same as 
+that of the real packets.
+
+.SH BYTE CLOCK SCHEDULING
+Packet transmission is scheduled based on the inter-packet gap of each 
+class (i.e. target rate).
+If the network has multiple bottleneck links, it is necessary to 
+schedule the order of packet transmission and the packet interval.  
+
+PSPacer maintains a virtual clock which counts the total number of transmitted 
+bytes instead of real time.  Each sub-class has its own local clock, 
+which is used to decide whether to send a packet or not.
+If there is idle time, a gap packet is inserted.
+
+.SH CLASSIFICATION
+Within one PSP instance, many classes may exist. Each of these classes
+contains its own qdisc.
+
+When enqueuing a packet, PSP starts at the root and uses various methods to 
+determine which class should be used to obtain the data to be enqueued. 
+
+In the standard configuration, this process is rather easy. 
+At each node we look for an instruction, and then go to the class the 
+instruction refers to. If the class found is a leaf-node (without 
+children), we enqueue the packet there. If it is not yet a leaf node, we do 
+the same thing over again starting from that node. 
+
+The following actions are performed in order at each node we visit, until one 
+of them moves us to another node or terminates the process.
+.TP
+(i)
+Consult filters attached to the class. If we are at a leaf node, we are done. 
+Otherwise, restart.
+.TP
+(ii)
+If none of the above returned with an instruction, send to the default class.
+.P
+.\" This algorithm makes sure that a packet always ends up somewhere, even while
+.\" you are busy building your configuration. 
+
+.SH QDISC
+The root of a PSP qdisc class tree has the following parameters:
+
+.TP 
+parent major:minor | root
+This mandatory parameter determines the place of the PSP instance, 
+either at the
+.B root
+of an interface or within an existing class.
+.TP
+handle major:
+Like all other qdiscs, the PSP can be assigned a handle. It should consist only
+of a major number, followed by a colon. Optional, but it is very useful 
+if classes will be generated within this qdisc.
+.TP 
+default minor-id
+Unclassified traffic is sent to the class with this minor-id.
+.TP
+rate rate
+Optional.  You can explicitly specify the maximum transmission rate.
+For example, if a 33MHz/32bit PCI bus is used to connect a Gigabit 
+Ethernet network interface, the bottleneck is the PCI bus, and the 
+system can not transmit packets at the rate of gigabit/sec. 
+
+.SH CLASSES
+Classes have a host of parameters to configure their operation.
+
+.TP 
+parent major:minor
+Specifies the place of this class within the hierarchy. If attached directly 
+to a qdisc and not to another class, minor can be omitted. Mandatory.
+.TP 
+classid major:minor
+Like qdiscs, classes can be named. The major number must be equal to the
+major number of the qdisc to which it belongs. Optional, but needed if this 
+class is going to have children.
+.TP 
+rate rate
+Maximum transmission rate assigned to this class, including all its children. 
+Optional, but required if this class is set to mode 1 (static target rate).
+.TP
+mode mode
+Range from 0 to 1.  The mode 0 is without pacing.  The mode 1 is
+pacing ba

[PATCHv3 net-2.6.25 0/3] PSPacer qdisc module

2007-11-28 Thread Ryousei Takano
Hi all,

This is the 3rd version of PSPacer patches.

PSPacer (Precise Software Pacer) is a qdisc module which realizes 
precise transmission bandwidth control. It smooths the bursty traffic that is 
often generated by TCP, without requiring any special hardware.
For your information, please see my previous post:
http://marc.info/?l=linux-netdev&m=119570861526290&w=2

Changes (v3):
* fixed alloc_gap_packet().
* fixed recalc_gapsize().
* changed the type of 'clock' from gapclock_t to u64.
* changed rate/allocated_rate/gapsize variables from u32 to u64.
* changed from tc_psp_xstats to gnet_stats_basic.
* added checking whether it is a root qdisc in psp_init().
* fixed qdisc_tree_decrease_qlen stuff.
* removed psp_change_qdisc().
* fixed coding style.

Changes (v2):
* checked by the checkpatch.pl script.
* introduced struct gaphdr.
* removed the HTB-way of using a "direct class".
* removed unnecessary skb_reserve() and magic values in alloc_gap_packet().
* added a proper check when skb_clone() fails in psp_dequeue().
* used qdisc_tree_decrease_qlen() in psp_graft().

Usage:
# tc qdisc add dev eth0 root handle 1: psp default 1
# tc class add dev eth0 parent 1: classid 1:1 psp rate 500mbit
# tc qdisc add dev eth0 parent 1:1 handle 10: pfifo

Patches:
[1/3] PSPacer kernel part
[2/3] PSPacer tc part
[3/3] PSPacer tc man page

Best regards,
Ryousei Takano
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv2 1/3] NET_SCHED: PSPacer qdisc module

2007-11-27 Thread Ryousei Takano
Hi Patrick,

> >>> +struct tc_psp_qopt
> >>> +{
> >>> + __u32   defcls;
> >>> + __u32   rate;
> >>> +};
> >>
> >> What unit is rate measured in?
> >>
> > 'rate' is the transmission rate in bytes per sec.
> 
> 
> So wouldn't it make sense to use u64 then for 10GBit networks?
> 
I decided to use u32 because tc_ratespec.rate is u32.
u32 is large enough for 10Gbit networks, but small for 40Gbit.
I will use u64, because code becomes simple and consistent.


> >>> + skb_put(skb, size);
> >> This is usually done before putting data in the packet.
> >>
> > Therefore, skb_put() is needed.
> 
> 
> I meant this is usually done before writing to the packet data,
> so you should move it up a few lines.
> 
I am silly:-) I understood. Thanks.


> >>> + while (!list_empty(&q->root))
> >>> + psp_destroy_class(sch, list_entry(q->root.next,
> >>> +   struct psp_class, sibling));
> >> list_for_each_entry_safe.
> >>
> > I think it works well. Should I need to use list_for_each_entry_safe?
> 
> 
> I don't doubt that it works, but list_for_each_entry_safe is
> the proper interface for this.
> 
I will fix it. Thanks.

Best regards,
Ryousei Takano
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv2 1/3] NET_SCHED: PSPacer qdisc module

2007-11-27 Thread Ryousei Takano
> One more thing: your qdisc can only be used as root qdisc since it
> produces packets itself and thereby violates the rule that a qdisc
> can only hand out packets that were previously enqueued to it.
> Using it as a leaf qdisc can make the upper qdiscs qlen counter
> go negative, causing infinite dequeue-loops, so you should make
> sure that its only possibly to use as root qdisc by checking the
> parent. It would also be better to do something like netem
> (enqueue produced packets at the root) to make sure the qlen
> counter is always accurate.
> 
I agree with you.
PSPacer should not be used with other rate regulation qdiscs. But 
I think that a combination of netem and PSPacer is useful for
emulating networks. The following paper describes experimental
results using PSPacer with netem:

   "Large Scale Gigabit Emulated Testbed for Grid Transport
   Evaluation", PFLDnet 2006.
   http://www.hpcc.jp/pfldnet2006/paper/s1_02.pdf

Best regards,
Ryousei Takano
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv2 2/3] TC: PSPacer qdisc module

2007-11-26 Thread Ryousei Takano
I am sorry I sent an old patch.
Please see this one.

--
[PATCHv2 2/3] TC: PSPacer qdisc module

This patch includes the PSPacer (Precise Software Pacer) qdisc
tc part, which achieves precise transmission bandwidth control.
You can find more information at the project web page
(http://www.gridmpi.org/gridtcp.jsp).

Signed-off-by: Ryousei Takano <[EMAIL PROTECTED]>
---
 include/linux/pkt_sched.h |   37 +
 tc/Makefile   |1 +
 tc/q_psp.c|  199 +
 3 files changed, 237 insertions(+), 0 deletions(-)
 create mode 100644 tc/q_psp.c

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 268c515..ed21e26 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -430,6 +430,43 @@ enum {
 
 #define TCA_ATM_MAX(__TCA_ATM_MAX - 1)
 
+/* Precise Software Pacer section */
+
+#define TC_PSP_MAXDEPTH (8) /* NOTE(review): presumably max class tree depth - confirm */
+
+typedef long long gapclock_t; /* type of the byte 'clock' counters */
+
+enum { /* values of the class "mode" option (target rate estimation method) */
+   MODE_NORMAL = 0, /* NORMAL=0 in the tc usage text */
+   MODE_STATIC = 1, /* STATIC=1 in the tc usage text */
+};
+
+struct tc_psp_copt /* class options */
+{
+   __u32   level; /* NOTE(review): presumably depth in the class tree - confirm */
+   __u32   mode; /* MODE_NORMAL or MODE_STATIC */
+   __u32   rate; /* NOTE(review): presumably bytes/sec like tc_psp_qopt.rate - confirm */
+};
+
+struct tc_psp_qopt /* qdisc options, filled in by psp_parse_opt() */
+{
+   __u32   defcls; /* minor id of the class for unclassified packets */
+   __u32   rate; /* bytes/sec */
+};
+
+struct tc_psp_xstats
+{
+   __u32   bytes;  /* gap packet statistics */
+   __u32   packets;
+};
+
+enum /* attribute types nested inside TCA_OPTIONS */
+{
+   TCA_PSP_UNSPEC,
+   TCA_PSP_COPT, /* presumably carries struct tc_psp_copt - confirm */
+   TCA_PSP_QOPT, /* presumably carries struct tc_psp_qopt - confirm */
+};
+
 /* Network emulator */
 
 enum
diff --git a/tc/Makefile b/tc/Makefile
index a715566..836df9d 100644
--- a/tc/Makefile
+++ b/tc/Makefile
@@ -12,6 +12,7 @@ TCMODULES += q_prio.o
 TCMODULES += q_tbf.o
 TCMODULES += q_cbq.o
 TCMODULES += q_rr.o
+TCMODULES += q_psp.o
 TCMODULES += q_netem.o
 TCMODULES += f_rsvp.o
 TCMODULES += f_u32.o
diff --git a/tc/q_psp.c b/tc/q_psp.c
new file mode 100644
index 000..1806b66
--- /dev/null
+++ b/tc/q_psp.c
@@ -0,0 +1,199 @@
+/*
+ * q_psp.c PSPacer: Precise Software Pacer
+ *
+ * Copyright (C) 2004-2007 National Institute of Advanced
+ * Industrial Science and Technology (AIST), Japan.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors:Ryousei Takano, <[EMAIL PROTECTED]>
+ *
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "utils.h"
+#include "tc_util.h"
+
+/*
+ * Print the usage summary for the psp qdisc and class options to stderr.
+ * Values shown in braces are the defaults.
+ */
+static void explain(void)
+{
+	fprintf(stderr,
+"Usage: ... qdisc add ... psp [ default N ] [rate RATE]\n"
+" default  minor id of class to which unclassified packets are sent {0}\n"
+" rate physical interface bandwidth\n\n"
+"... class add ... psp [ mode M ] [ rate RATE ]\n"
+" mode target rate estimation method (NORMAL=0 STATIC=1) {0}\n"
+" rate rate allocated to this class\n");
+}
+
+/*
+ * Report that the value given for option @arg is illegal, then print the
+ * usage summary.  @arg is only read, so string literals may be passed.
+ */
+static void explain1(const char *arg)
+{
+	fprintf(stderr, "Illegal \"%s\"\n", arg);
+	explain();
+}
+
+
+/*
+ * Parse the psp qdisc command-line options into a struct tc_psp_qopt and
+ * append it to the netlink request @n.
+ *
+ * Recognised keywords:
+ *   rate RATE   parsed with get_rate() into qopt.rate
+ *   default N   parsed with get_u32() in base 16 into qopt.defcls
+ *   help        print the usage summary and fail
+ *
+ * The options travel as a TCA_PSP_QOPT attribute nested inside TCA_OPTIONS.
+ * Nested attributes are numbered with the TCA_PSP_* enum (psp_print_opt()
+ * sizes its table with it), so the payload must carry that type rather
+ * than the outer container's.
+ *
+ * Returns 0 on success, -1 on an unknown keyword, an invalid value or
+ * after printing help.
+ */
+static int psp_parse_opt(struct qdisc_util *qu, int argc, char **argv,
+			 struct nlmsghdr *n)
+{
+	struct tc_psp_qopt qopt;
+	struct rtattr *tail;
+
+	memset(&qopt, 0, sizeof(qopt));
+
+	while (argc > 0) {
+		if (matches(*argv, "rate") == 0) {
+			NEXT_ARG();
+			if (get_rate(&qopt.rate, *argv)) {
+				explain1("rate");
+				return -1;
+			}
+		} else if (matches(*argv, "default") == 0) {
+			NEXT_ARG();
+			if (get_u32(&qopt.defcls, *argv, 16)) {
+				explain1("default");
+				return -1;
+			}
+		} else if (matches(*argv, "help") == 0) {
+			explain();
+			return -1;
+		} else {
+			fprintf(stderr, "What is \"%s\"?\n", *argv);
+			explain();
+			return -1;
+		}
+		argc--;
+		argv++;
+	}
+
+	/* Open the TCA_OPTIONS container; its length is patched up below. */
+	tail = NLMSG_TAIL(n);
+	addattr_l(n, 1024, TCA_OPTIONS, NULL, 0);
+	addattr_l(n, 2024, TCA_PSP_QOPT, &qopt, NLMSG_ALIGN(sizeof(qopt)));
+	tail->rta_len = (void *) NLMSG_TAIL(n) - (void *) tail;
+	return 0;
+}
+
+static int psp_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt)
+{
+   struct rtattr *tb[TCA_PSP_QOPT+1];
+   struct tc_psp_copt *copt;
+   struct tc_psp_qopt *qopt;
+   SPRINT_BUF(b);
+
+   if (opt == NULL)
+   return 0;
+
+   memset(

[PATCHv2 2/3] TC: PSPacer qdisc module

2007-11-26 Thread Ryousei Takano
This patch includes the PSPacer (Precise Software Pacer) qdisc
tc part, which achieves precise transmission bandwidth control.
You can find more information at the project web page
(http://www.gridmpi.org/gridtcp.jsp).

Signed-off-by: Ryousei Takano <[EMAIL PROTECTED]>
---
 include/linux/pkt_sched.h |   38 +
 tc/Makefile   |1 +
 tc/q_psp.c|  199 +
 3 files changed, 238 insertions(+), 0 deletions(-)
 create mode 100644 tc/q_psp.c

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 268c515..c708082 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -430,6 +430,44 @@ enum {
 
 #define TCA_ATM_MAX(__TCA_ATM_MAX - 1)
 
+/* Precise Software Pacer section */
+
+#define TC_PSP_MAXDEPTH (8) /* NOTE(review): presumably max class tree depth - confirm */
+
+typedef long long gapclock_t; /* type of the byte 'clock' counters */
+
+enum { /* values of the class "mode" option (target rate estimation method) */
+   MODE_NORMAL = 0, /* NORMAL=0 in the tc usage text */
+   MODE_STATIC = 1, /* STATIC=1 in the tc usage text */
+};
+
+struct tc_psp_copt /* class options */
+{
+   __u32   level; /* NOTE(review): presumably depth in the class tree - confirm */
+   __u32   mode; /* MODE_NORMAL or MODE_STATIC */
+   __u32   rate; /* NOTE(review): presumably bytes/sec like tc_psp_qopt.rate - confirm */
+};
+
+struct tc_psp_qopt /* qdisc options, filled in by psp_parse_opt() */
+{
+   __u32   defcls; /* minor id of the class for unclassified packets */
+   __u32   rate; /* bytes/sec */
+   __u32   direct_pkts; /* NOTE(review): presumably packets sent via the HTB-style direct class - confirm */
+};
+
+struct tc_psp_xstats
+{
+   __u32   bytes;  /* gap packet statistics */
+   __u32   packets;
+};
+
+enum /* attribute types nested inside TCA_OPTIONS */
+{
+   TCA_PSP_UNSPEC,
+   TCA_PSP_COPT, /* presumably carries struct tc_psp_copt - confirm */
+   TCA_PSP_QOPT, /* presumably carries struct tc_psp_qopt - confirm */
+};
+
 /* Network emulator */
 
 enum
diff --git a/tc/Makefile b/tc/Makefile
index a715566..836df9d 100644
--- a/tc/Makefile
+++ b/tc/Makefile
@@ -12,6 +12,7 @@ TCMODULES += q_prio.o
 TCMODULES += q_tbf.o
 TCMODULES += q_cbq.o
 TCMODULES += q_rr.o
+TCMODULES += q_psp.o
 TCMODULES += q_netem.o
 TCMODULES += f_rsvp.o
 TCMODULES += f_u32.o
diff --git a/tc/q_psp.c b/tc/q_psp.c
new file mode 100644
index 000..1806b66
--- /dev/null
+++ b/tc/q_psp.c
@@ -0,0 +1,199 @@
+/*
+ * q_psp.c PSPacer: Precise Software Pacer
+ *
+ * Copyright (C) 2004-2007 National Institute of Advanced
+ * Industrial Science and Technology (AIST), Japan.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors:Ryousei Takano, <[EMAIL PROTECTED]>
+ *
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "utils.h"
+#include "tc_util.h"
+
+/*
+ * Print the usage summary for the psp qdisc and class options to stderr.
+ * Values shown in braces are the defaults.
+ */
+static void explain(void)
+{
+	fprintf(stderr,
+"Usage: ... qdisc add ... psp [ default N ] [rate RATE]\n"
+" default  minor id of class to which unclassified packets are sent {0}\n"
+" rate physical interface bandwidth\n\n"
+"... class add ... psp [ mode M ] [ rate RATE ]\n"
+" mode target rate estimation method (NORMAL=0 STATIC=1) {0}\n"
+" rate rate allocated to this class\n");
+}
+
+/*
+ * Report that the value given for option @arg is illegal, then print the
+ * usage summary.  @arg is only read, so string literals may be passed.
+ */
+static void explain1(const char *arg)
+{
+	fprintf(stderr, "Illegal \"%s\"\n", arg);
+	explain();
+}
+
+
+/*
+ * Parse the psp qdisc command-line options into a struct tc_psp_qopt and
+ * append it to the netlink request @n.
+ *
+ * Recognised keywords:
+ *   rate RATE   parsed with get_rate() into qopt.rate
+ *   default N   parsed with get_u32() in base 16 into qopt.defcls
+ *   help        print the usage summary and fail
+ *
+ * The options travel as a TCA_PSP_QOPT attribute nested inside TCA_OPTIONS.
+ * Nested attributes are numbered with the TCA_PSP_* enum (psp_print_opt()
+ * sizes its table with it), so the payload must carry that type rather
+ * than the outer container's.
+ *
+ * Returns 0 on success, -1 on an unknown keyword, an invalid value or
+ * after printing help.
+ */
+static int psp_parse_opt(struct qdisc_util *qu, int argc, char **argv,
+			 struct nlmsghdr *n)
+{
+	struct tc_psp_qopt qopt;
+	struct rtattr *tail;
+
+	memset(&qopt, 0, sizeof(qopt));
+
+	while (argc > 0) {
+		if (matches(*argv, "rate") == 0) {
+			NEXT_ARG();
+			if (get_rate(&qopt.rate, *argv)) {
+				explain1("rate");
+				return -1;
+			}
+		} else if (matches(*argv, "default") == 0) {
+			NEXT_ARG();
+			if (get_u32(&qopt.defcls, *argv, 16)) {
+				explain1("default");
+				return -1;
+			}
+		} else if (matches(*argv, "help") == 0) {
+			explain();
+			return -1;
+		} else {
+			fprintf(stderr, "What is \"%s\"?\n", *argv);
+			explain();
+			return -1;
+		}
+		argc--;
+		argv++;
+	}
+
+	/* Open the TCA_OPTIONS container; its length is patched up below. */
+	tail = NLMSG_TAIL(n);
+	addattr_l(n, 1024, TCA_OPTIONS, NULL, 0);
+	addattr_l(n, 2024, TCA_PSP_QOPT, &qopt, NLMSG_ALIGN(sizeof(qopt)));
+	tail->rta_len = (void *) NLMSG_TAIL(n) - (void *) tail;
+	return 0;
+}
+
+static int psp_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt)
+{
+   struct rtattr *tb[TCA_PSP_QOPT+1];
+   struct tc_psp_copt *copt;
+   struct tc_psp_qopt *qopt;
+   SPRINT_BUF(b);
+
+   if (opt == NULL)
+   return 0;
+
+   memset(tb, 0, sizeof(tb));
+   parse_rtattr_nested(tb, T

[PATCHv2 3/3] TC: PSPacer man page

2007-11-26 Thread Ryousei Takano
This patch includes the man page of the PSPacer (Precise Software
Pacer) qdisc module.

Signed-off-by: Ryousei Takano <[EMAIL PROTECTED]>
---
 man/man8/tc-psp.8 |  166 +
 1 files changed, 166 insertions(+), 0 deletions(-)
 create mode 100644 man/man8/tc-psp.8

diff --git a/man/man8/tc-psp.8 b/man/man8/tc-psp.8
new file mode 100644
index 000..a6e26bf
--- /dev/null
+++ b/man/man8/tc-psp.8
@@ -0,0 +1,166 @@
+.TH PSP 8 "13 October 2007" "iproute2" "Linux"
+.SH NAME
+PSP \- Precise Software Pacer
+.SH SYNOPSIS
+.B tc qdisc ... dev
+dev
+.B  ( parent
+classid 
+.B | root) [ handle 
+major: 
+.B ] psp [ default 
+minor-id
+.B ] [ rate
+rate
+.B ] 
+
+.B tc class ... dev
+dev
+.B parent 
+major:[minor]
+.B [ classid 
+major:minor
+.B ] psp [ rate
+rate
+.B ] [ mode 
+mode
+.B ] 
+
+.SH DESCRIPTION
+Precise Software Pacer (PSPacer) is a classful queuing discipline 
+which controls traffic with
+.BR tc (8)
+command.
+PSP achieves a precise pacing per class.
+
+.SH GAP PACKET
+The key to realizing precise pacing is to control the starting time of 
+the transmission of each packet.  We propose a simple yet accurate 
+mechanism to trigger the transmission of a packet.  That is, to insert 
+a gap packet between the real packets.  The gap packet produces a gap 
+between sequentially transmitted real packets.
+We employ a PAUSE packet as a gap packet.  A PAUSE packet is defined in 
+the IEEE 802.3x flow control.
+
+By changing the gap packet size, the starting time of 
+the next real packet transmission can be precisely controlled.
+For example, to control a half rate transmission, a gap packet is inserted 
+between every real packet where the gap packet size is the same as 
+that of the real packets.
+
+.SH BYTE CLOCK SCHEDULING
+Packet transmission is scheduled based on the inter-packet gap of each 
+class (i.e. target rate).
+If the network has multiple bottleneck links, it is necessary to 
+schedule the order of packet transmission and the packet interval.  
+
+PSPacer maintains a virtual clock that counts the total number of 
+transmitted bytes instead of real time.  Each sub-class has its own local 
+clock, which is used to decide whether to send a packet or not.
+If there is idle time, a gap packet is inserted.
+
+.SH CLASSIFICATION
+Within one PSP instance, many classes may exist. Each of these classes
+contains its own qdisc.
+
+When enqueuing a packet, PSP starts at the root and uses various methods to 
+determine which class should be used to obtain the data to be enqueued. 
+
+In the standard configuration, this process is rather easy. 
+At each node we look for an instruction, and then go to the class the 
+instruction refers to. If the class found is a leaf-node (without 
+children), we enqueue the packet there. If it is not yet a leaf node, we do 
+the same thing over again starting from that node. 
+
+The following actions are performed in order at each node we visit, until 
+we move to another node or the process terminates.
+.TP
+(i)
+Consult filters attached to the class. If we are at a leaf node, we are done. 
+Otherwise, restart.
+.TP
+(ii)
+If none of the above returned with an instruction, send to the default class.
+.P
+.\" This algorithm makes sure that a packet always ends up somewhere, even while
+.\" you are busy building your configuration.
+
+.SH QDISC
+The root of a PSP qdisc class tree has the following parameters:
+
+.TP 
+parent major:minor | root
+This mandatory parameter determines the place of the PSP instance, 
+either at the
+.B root
+of an interface or within an existing class.
+.TP
+handle major:
+Like all other qdiscs, the PSP can be assigned a handle. It should consist only
+of a major number, followed by a colon. Optional, but it is very useful 
+if classes will be generated within this qdisc.
+.TP 
+default minor-id
+Unclassified traffic is sent to the class with this minor-id.
+.TP
+rate rate
+Optional.  You can explicitly specify the maximum transmission rate.
+For example, if a 33MHz/32bit PCI bus is used to connect a Gigabit 
+Ethernet network interface, the bottleneck is the PCI bus, and the 
+system can not transmit packets at the rate of gigabit/sec. 
+
+.SH CLASSES
+Classes have a host of parameters to configure their operation.
+
+.TP 
+parent major:minor
+Specifies the place of this class within the hierarchy. If attached directly 
+to a qdisc and not to another class, minor can be omitted. Mandatory.
+.TP 
+classid major:minor
+Like qdiscs, classes can be named. The major number must be equal to the
+major number of the qdisc to which it belongs. Optional, but needed if this 
+class is going to have children.
+.TP 
+rate rate
+Maximum transmission rate assigned to this class, including all its children. 
+Optional, but required if this class is set to mode 1 (static target rate).
+.TP
+mode mode
+Range from 0 to 1.  The mode 0 is without pacing.  The mode 1 is
+pacing ba

[PATCHv2 net-2.6.25 0/3] PSPacer qdisc module

2007-11-26 Thread Ryousei Takano
Hi all,

This is the 2nd version of PSPacer patches.

PSPacer (Precise Software Pacer) is a qdisc module which realizes 
precise transmission bandwidth control. It smooths the bursty traffic 
that TCP often generates, without any special hardware.
For your information, please see my previous post:
http://marc.info/?l=linux-netdev&m=119570861526290&w=2

Changes:
* checked by the checkpatch.pl script.
* introduced struct gaphdr.
* removed the HTB-way of using a "direct class".
* removed unnecessary skb_reserve() and magic values in alloc_gap_packet().
* added a proper check when skb_clone() fails in psp_dequeue().
* used qdisc_tree_decrease_qlen() in psp_graft().

Usage:
# tc qdisc add dev eth0 root handle 1: psp default 1
# tc class add dev eth0 parent 1: classid 1:1 psp rate 500mbit
# tc qdisc add dev eth0 parent 1:1 handle 10: pfifo

Patches:
[1/3] PSPacer kernel part
[2/3] PSPacer tc part
[3/3] PSPacer tc man page

Best regards,
Ryousei Takano
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCHv2 1/3] NET_SCHED: PSPacer qdisc module

2007-11-26 Thread Ryousei Takano
This patch includes the PSPacer (Precise Software Pacer) qdisc
module, which achieves precise transmission bandwidth control.
You can find more information at the project web page
(http://www.gridmpi.org/gridtcp.jsp).

Signed-off-by: Ryousei Takano <[EMAIL PROTECTED]>
---
 include/linux/pkt_sched.h |   37 ++
 net/sched/Kconfig |9 +
 net/sched/Makefile|1 +
 net/sched/sch_psp.c   |  958 +
 4 files changed, 1005 insertions(+), 0 deletions(-)
 create mode 100644 net/sched/sch_psp.c

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 919af93..fda41cd 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -430,6 +430,43 @@ enum {
 
 #define TCA_ATM_MAX(__TCA_ATM_MAX - 1)
 
+/* Precise Software Pacer section */
+
+/* presumably the maximum depth of the PSP class hierarchy -- confirm in sch_psp.c */
+#define TC_PSP_MAXDEPTH (8)
+
+/* presumably the type of the byte clock kept by sch_psp.c -- confirm there */
+typedef long long gapclock_t;
+
+/* Values of tc_psp_copt.mode ("target rate estimation method" in tc usage). */
+enum {
+   MODE_NORMAL = 0, /* mode 0: without pacing (per tc-psp.8) */
+   MODE_STATIC = 1, /* mode 1: static target rate (per tc-psp.8) */
+};
+
+/* Per-class options. */
+struct tc_psp_copt
+{
+   __u32   level;  /* presumably the class's level in the hierarchy -- confirm */
+   __u32   mode;   /* MODE_NORMAL or MODE_STATIC */
+   __u32   rate;   /* rate allocated to this class, bytes/sec */
+};
+
+/* Per-qdisc options. */
+struct tc_psp_qopt
+{
+   __u32   defcls; /* minor id of the class for unclassified packets */
+   __u32   rate;   /* physical interface bandwidth, bytes/sec */
+};
+
+struct tc_psp_xstats
+{
+   __u32   bytes;  /* gap packet statistics */
+   __u32   packets;
+};
+
+/* Attribute types nested inside TCA_OPTIONS for PSP. */
+enum
+{
+   TCA_PSP_UNSPEC,
+   TCA_PSP_COPT,   /* presumably carries struct tc_psp_copt -- confirm */
+   TCA_PSP_QOPT,   /* presumably carries struct tc_psp_qopt -- confirm */
+};
+
 /* Network emulator */
 
 enum
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 9c15c48..ec40e43 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -184,6 +184,15 @@ config NET_SCH_DSMARK
  To compile this code as a module, choose M here: the
  module will be called sch_dsmark.
 
+config NET_SCH_PSP
+   tristate "Precise Software Pacer (PSP)"
+   ---help---
+ Say Y here if you want to include PSPacer module, which means
+ that you will be able to control precise pacing.
+
+ To compile this driver as a module, choose M here: the
+ module will be called sch_psp.
+
 config NET_SCH_NETEM
tristate "Network emulator (NETEM)"
---help---
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 81ecbe8..85425c2 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o
 obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o
 obj-$(CONFIG_NET_SCH_PRIO) += sch_prio.o
 obj-$(CONFIG_NET_SCH_ATM)  += sch_atm.o
+obj-$(CONFIG_NET_SCH_PSP)  += sch_psp.o
 obj-$(CONFIG_NET_SCH_NETEM)+= sch_netem.o
 obj-$(CONFIG_NET_CLS_U32)  += cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)   += cls_route.o
diff --git a/net/sched/sch_psp.c b/net/sched/sch_psp.c
new file mode 100644
index 000..f475b50
--- /dev/null
+++ b/net/sched/sch_psp.c
@@ -0,0 +1,958 @@
+/*
+ * net/sched/sch_psp.c PSPacer: Precise Software Pacer
+ *
+ * Copyright (C) 2004-2007 National Institute of Advanced
+ * Industrial Science and Technology (AIST), Japan.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors:Ryousei Takano, <[EMAIL PROTECTED]>
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/* PSPacer achieves precise rate regulation results, and no microscopic
+ * burst transmission which exceeds the limit is generated.
+ *
+ * The basic idea is that transmission timing can be precisely controlled,
+ * if packets are sent back-to-back at the wire rate.  PSPacer controls
+ * the packet transmission intervals by inserting additional packets,
+ * called gap packets, between adjacent packets.  The transmission interval
+ * can be controlled accurately by adjusting the number and size of the gap
+ * packets. PSPacer uses the 802.3x PAUSE frame as the gap packet.
+ *
+ * For the purpose of adjusting the gap size, this Qdisc maintains a byte
+ * clock which records the total number of bytes transmitted per connection.
+ * Each sub-class has a class-local clock which is used to decide
+ * whether to send a packet or not.  If there are no packets to send,
+ * gap packets are inserted.
+ *
+ * References:
+ * [1] R.Takano, T.Kudoh, Y.Kodama, M.Matsuda, H.Tezuka, and Y.Ishikawa,
+ * "Design and Evaluation of Precise Software Pacing Mechanisms for
+ * Fast Long-Distance Networks", PFLDnet2005.
+ * [2] http://www.gridmpi.org/gridtcp.jsp
+ */
+
+#define HW_GAP (16)	/* Preamble(8) + Inter Frame Gap(8) */
+#define FCS (4)	/* Frame Check Sequence(4) */
+#define MIN_GAP (64)	/* Minimum size of gap packet */
+#define MIN_TARGET_RATE (1000)	/* 1 KB/s (= 8 Kbps) */
+
+#define PSP_HSIZE (16)	/* presumably the class hash table size -- confirm */
+
+#define BIT2BYTE(n) ((n) >> 3)	/* bit count to byte count (divide by 8) */
+
+struct psp_class
+{
+   u32 classid; 

[RFC][PATCH 0/3] PSPacer qdisc module

2007-11-21 Thread Ryousei Takano
 eth0 tso off).


Best regards,
Ryousei Takano
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH 1/3] NET_SCHED: PSPacer qdisc module

2007-11-21 Thread Ryousei Takano
Hi jamal,

> Good stuff.
> I have not read your paper - There are NICs out there (chelsio comes to
> mind) which claim to do pacing and have shown impressive numbers with
> TCP. Is your approach similar? Are there patents involved by some of
> these hardware vendors? (It would not be suprising if they exist).
> 
As far as I know, no. (I do not know the details of the Chelsio NICs.)
Pacing is a general idea, but our implementation approach is unique.
PSPacer smooths the bursty traffic that TCP often generates,
without any special hardware.

> The advantage with NICs is they have very good control of the timing
> (clock granularity being extremely important in cases like this) - what
> were your measurements based on i.e what clock source did you use on
> Linux?

The key idea of PSPacer is to determine transmission timing of packets 
by the number of bytes transferred. If packets are transferred back to 
back, the timing a packet is sent can be determined by the number of 
bytes sent before the packet. PSPacer fills the gaps between time 
aligned "real packets" (the packets which are sent by user program) by 
"gap packets". The real packets and gap packets are sent back to back, 
and thus the timing of transmission of each real packet can be precisely 
controlled by adjusting the gap packet size. As the gap packets, the IEEE 
802.3x PAUSE frames are used. PAUSE frames are discarded at a switch 
input port, and only real packets go through the switch keeping the 
original intervals. 

In the past, some software-based pacing schemes have been proposed.
These schemes use timer interrupt based packet transmission timing control.
Therefore, to achieve precise pacing, they require the operating system
to maintain a high resolution timer, which could incur a large overhead.

> Also, the idea of using a PAUSE frame to add gaps is interesting, but
> you should note that in linux a qdisc may be attached to any network
> device and this for example maybe a PPP device etc. What would you use
> for gaps in that case? 


> I apologize if the answers are in your papers - i just glossed over.
> 
> cheers,
> jamal 
> 
Best regards,
Ryousei Takano
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: HTB/HSFC shaping precision

2007-11-21 Thread Ryousei Takano
Hi jamal and denys,

> > One message later, thats what i dreamed about :-)
> > Subject: [RFC][PATCH 1/3] NET_SCHED: PSPacer qdisc module
> > On website they have very good explanation...
> > http://www.gridmpi.org/gridtcp.jsp
>
> That looks interesting - without reading the papers a few questions are
> developing in my brain cells; for example it looks very similar to what
> the chelsio NICs claim to do (which could be a good thing for TCP).
> Whenever i see someone implementing something in hardware, i always get
> flushes of "patents".
>
Thanks for looking at our web page.

PSPacer has quite accurate shaping precision.
The point is that it does not require special hardware like the Chelsio NICs.
PSPacer uses a gap packet, whose format is IEEE 802.3x pause frame,
to control the interval between outgoing packets.
As far as I know, it is a unique approach.

Best Regards,
Ryousei Takano
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC][PATCH 2/3] TC: PSPacer qdisc module

2007-11-21 Thread Ryousei Takano
This patch includes the PSPacer (Precise Software Pacer) qdisc
tc part, which achieves precise transmission bandwidth control.
You can find more information at the project web page
(http://www.gridmpi.org/gridtcp.jsp).

Signed-off-by: Ryousei Takano <[EMAIL PROTECTED]>
---
 include/linux/pkt_sched.h |   38 +
 tc/Makefile   |1 +
 tc/q_psp.c|  200 +
 3 files changed, 239 insertions(+), 0 deletions(-)
 create mode 100644 tc/q_psp.c

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 268c515..c708082 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -430,6 +430,44 @@ enum {
 
 #define TCA_ATM_MAX(__TCA_ATM_MAX - 1)
 
+/* Precise Software Pacer section */
+
+/* presumably the maximum depth of the PSP class hierarchy -- confirm in sch_psp.c */
+#define TC_PSP_MAXDEPTH (8)
+
+/* presumably the type of the byte clock kept by sch_psp.c -- confirm there */
+typedef long long gapclock_t;
+
+/* Values of tc_psp_copt.mode ("target rate estimation method" in tc usage). */
+enum {
+   MODE_NORMAL = 0, /* mode 0: without pacing (per tc-psp.8) */
+   MODE_STATIC = 1, /* mode 1: static target rate (per tc-psp.8) */
+};
+
+/* Per-class options. */
+struct tc_psp_copt
+{
+   __u32   level;  /* presumably the class's level in the hierarchy -- confirm */
+   __u32   mode;   /* MODE_NORMAL or MODE_STATIC */
+   __u32   rate;   /* rate allocated to this class, bytes/sec */
+};
+
+/* Per-qdisc options. */
+struct tc_psp_qopt
+{
+   __u32   defcls; /* minor id of the class for unclassified packets */
+   __u32   rate;   /* physical interface bandwidth, bytes/sec */
+   __u32   direct_pkts; /* presumably counts packets sent via an HTB-style direct class -- confirm */
+};
+
+struct tc_psp_xstats
+{
+   __u32   bytes;  /* gap packet statistics */
+   __u32   packets;
+};
+
+/* Attribute types nested inside TCA_OPTIONS for PSP. */
+enum
+{
+   TCA_PSP_UNSPEC,
+   TCA_PSP_COPT,   /* presumably carries struct tc_psp_copt -- confirm */
+   TCA_PSP_QOPT,   /* presumably carries struct tc_psp_qopt -- confirm */
+};
+
 /* Network emulator */
 
 enum
diff --git a/tc/Makefile b/tc/Makefile
index a715566..836df9d 100644
--- a/tc/Makefile
+++ b/tc/Makefile
@@ -12,6 +12,7 @@ TCMODULES += q_prio.o
 TCMODULES += q_tbf.o
 TCMODULES += q_cbq.o
 TCMODULES += q_rr.o
+TCMODULES += q_psp.o
 TCMODULES += q_netem.o
 TCMODULES += f_rsvp.o
 TCMODULES += f_u32.o
diff --git a/tc/q_psp.c b/tc/q_psp.c
new file mode 100644
index 000..e3f4cf7
--- /dev/null
+++ b/tc/q_psp.c
@@ -0,0 +1,200 @@
+/*
+ * q_psp.c PSPacer: Precise Software Pacer
+ *
+ * Copyright (C) 2004-2007 National Institute of Advanced 
+ * Industrial Science and Technology (AIST), Japan.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors:Ryousei Takano, <[EMAIL PROTECTED]>
+ *
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "utils.h"
+#include "tc_util.h"
+
+/* Print the usage summary for the psp qdisc and class options to stderr. */
+static void explain(void)
+{
+   fprintf(stderr, 
+"Usage: ... qdisc add ... psp [ default N ] [rate RATE]\n"
+" default  minor id of class to which unclassified packets are sent {0}\n"
+" rate physical interface bandwidth\n\n"
+"... class add ... psp mode M [ rate MBPS ]\n"
+" mode target rate estimation method (NORMAL=0 STATIC=1) {0}\n"
+" rate rate allocated to this class\n");
+}
+
+/*
+ * Report a bad value for the option named by arg on stderr, then print
+ * the full usage text via explain().
+ */
+static void explain1(char *arg)
+{
+   fprintf(stderr, "Illegal \"%s\"\n", arg);
+   explain();
+}
+
+
+/*
+ * Parse the command-line options of a psp qdisc and append them to the
+ * netlink request n.
+ *
+ * Recognised options:
+ *   rate RATE   parsed by get_rate() into qopt.rate
+ *   default N   parsed by get_u32() with base 16 into qopt.defcls
+ *   help        prints the usage text and fails
+ *
+ * On success a TCA_OPTIONS attribute is appended to n, containing the
+ * struct tc_psp_qopt as a nested TCA_PSP_QOPT attribute.
+ *
+ * Returns 0 on success, or -1 after printing a diagnostic and the usage
+ * text when an argument is invalid or unknown.
+ */
+static int psp_parse_opt(struct qdisc_util *qu, int argc, char **argv,
+			 struct nlmsghdr *n)
+{
+	struct tc_psp_qopt qopt;
+	struct rtattr *tail;
+
+	memset(&qopt, 0, sizeof(qopt));
+
+	while (argc > 0) {
+		if (matches(*argv, "rate") == 0) {
+			NEXT_ARG();
+			if (get_rate(&qopt.rate, *argv)) {
+				explain1("rate");
+				return -1;
+			}
+		} else if (matches(*argv, "default") == 0) {
+			NEXT_ARG();
+			if (get_u32(&qopt.defcls, *argv, 16)) {
+				explain1("default");
+				return -1;
+			}
+		} else if (matches(*argv, "help") == 0) {
+			explain();
+			return -1;
+		} else {
+			fprintf(stderr, "What is \"%s\"?\n", *argv);
+			explain();
+			return -1;
+		}
+		argc--;
+		argv++;
+	}
+
+	/* Start the TCA_OPTIONS nest; its length is patched in below. */
+	tail = NLMSG_TAIL(n);
+	addattr_l(n, 1024, TCA_OPTIONS, NULL, 0);
+	/*
+	 * Type the nested payload as TCA_PSP_QOPT, the index that
+	 * psp_print_opt() sizes its attribute table by, rather than
+	 * reusing the outer TCA_OPTIONS type.
+	 */
+	addattr_l(n, 2024, TCA_PSP_QOPT, &qopt, NLMSG_ALIGN(sizeof(qopt)));
+	tail->rta_len = (void *) NLMSG_TAIL(n) - (void *) tail;
+	return 0;
+}
+
+static int psp_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt)
+{
+   struct rtattr *tb[TCA_PSP_QOPT+1];
+   struct tc_psp_copt *copt;
+   struct tc_psp_qopt *qopt;
+   SPRINT_BUF(b);
+
+   if (opt == NULL)
+   return 0;
+
+   memset(tb, 0, sizeof(tb));
+   parse_rtattr_nested(tb, T

[RFC][PATCH 3/3] TC: PSPacer man page

2007-11-21 Thread Ryousei Takano
This patch includes the man page of the PSPacer (Precise Software
Pacer) qdisc module.

Signed-off-by: Ryousei Takano <[EMAIL PROTECTED]>
---
 man/man8/tc-psp.8 |  166 +
 1 files changed, 166 insertions(+), 0 deletions(-)
 create mode 100644 man/man8/tc-psp.8

diff --git a/man/man8/tc-psp.8 b/man/man8/tc-psp.8
new file mode 100644
index 000..a6e26bf
--- /dev/null
+++ b/man/man8/tc-psp.8
@@ -0,0 +1,166 @@
+.TH PSP 8 "13 October 2007" "iproute2" "Linux"
+.SH NAME
+PSP \- Precise Software Pacer
+.SH SYNOPSIS
+.B tc qdisc ... dev
+dev
+.B  ( parent
+classid 
+.B | root) [ handle 
+major: 
+.B ] psp [ default 
+minor-id
+.B ] [ rate
+rate
+.B ] 
+
+.B tc class ... dev
+dev
+.B parent 
+major:[minor]
+.B [ classid 
+major:minor
+.B ] psp [ rate
+rate
+.B ] [ mode 
+mode
+.B ] 
+
+.SH DESCRIPTION
+Precise Software Pacer (PSPacer) is a classful queuing discipline 
+which controls traffic with
+.BR tc (8)
+command.
+PSP achieves a precise pacing per class.
+
+.SH GAP PACKET
+The key to realizing precise pacing is to control the starting time of 
+the transmission of each packet.  We propose a simple yet accurate 
+mechanism to trigger the transmission of a packet.  That is, to insert 
+a gap packet between the real packets.  The gap packet produces a gap 
+between sequentially transmitted real packets.
+We employ a PAUSE packet as a gap packet.  A PAUSE packet is defined in 
+the IEEE 802.3x flow control.
+
+By changing the gap packet size, the starting time of 
+the next real packet transmission can be precisely controlled.
+For example, to control a half rate transmission, a gap packet is inserted 
+between every real packet where the gap packet size is the same as 
+that of the real packets.
+
+.SH BYTE CLOCK SCHEDULING
+Packet transmission is scheduled based on the inter-packet gap of each 
+class (i.e. target rate).
+If the network has multiple bottleneck links, it is necessary to 
+schedule the order of packet transmission and the packet interval.  
+
+PSPacer maintains a virtual clock that counts the total number of 
+transmitted bytes instead of real time.  Each sub-class has its own local 
+clock, which is used to decide whether to send a packet or not.
+If there is idle time, a gap packet is inserted.
+
+.SH CLASSIFICATION
+Within one PSP instance, many classes may exist. Each of these classes
+contains its own qdisc.
+
+When enqueuing a packet, PSP starts at the root and uses various methods to 
+determine which class should be used to obtain the data to be enqueued. 
+
+In the standard configuration, this process is rather easy. 
+At each node we look for an instruction, and then go to the class the 
+instruction refers to. If the class found is a leaf-node (without 
+children), we enqueue the packet there. If it is not yet a leaf node, we do 
+the same thing over again starting from that node. 
+
+The following actions are performed in order at each node we visit, until 
+we move to another node or the process terminates.
+.TP
+(i)
+Consult filters attached to the class. If we are at a leaf node, we are done. 
+Otherwise, restart.
+.TP
+(ii)
+If none of the above returned with an instruction, send to the default class.
+.P
+.\" This algorithm makes sure that a packet always ends up somewhere, even while
+.\" you are busy building your configuration.
+
+.SH QDISC
+The root of a PSP qdisc class tree has the following parameters:
+
+.TP 
+parent major:minor | root
+This mandatory parameter determines the place of the PSP instance, 
+either at the
+.B root
+of an interface or within an existing class.
+.TP
+handle major:
+Like all other qdiscs, the PSP can be assigned a handle. It should consist only
+of a major number, followed by a colon. Optional, but it is very useful 
+if classes will be generated within this qdisc.
+.TP 
+default minor-id
+Unclassified traffic is sent to the class with this minor-id.
+.TP
+rate rate
+Optional.  You can explicitly specify the maximum transmission rate.
+For example, if a 33MHz/32bit PCI bus is used to connect a Gigabit 
+Ethernet network interface, the bottleneck is the PCI bus, and the 
+system can not transmit packets at the rate of gigabit/sec. 
+
+.SH CLASSES
+Classes have a host of parameters to configure their operation.
+
+.TP 
+parent major:minor
+Specifies the place of this class within the hierarchy. If attached directly 
+to a qdisc and not to another class, minor can be omitted. Mandatory.
+.TP 
+classid major:minor
+Like qdiscs, classes can be named. The major number must be equal to the
+major number of the qdisc to which it belongs. Optional, but needed if this 
+class is going to have children.
+.TP 
+rate rate
+Maximum transmission rate assigned to this class, including all its children. 
+Optional, but required if this class is set to mode 1 (static target rate).
+.TP
+mode mode
+Range from 0 to 1.  The mode 0 is without pacing.  The mode 1 is
+pacing ba

[RFC][PATCH 1/3] NET_SCHED: PSPacer qdisc module

2007-11-21 Thread Ryousei Takano
This patch includes the PSPacer (Precise Software Pacer) qdisc
module, which achieves precise transmission bandwidth control.
You can find more information at the project web page
(http://www.gridmpi.org/gridtcp.jsp).

Signed-off-by: Ryousei Takano <[EMAIL PROTECTED]>
---
 include/linux/pkt_sched.h |   38 ++
 net/sched/Kconfig |9 +
 net/sched/Makefile|1 +
 net/sched/sch_psp.c   |  959 +
 4 files changed, 1007 insertions(+), 0 deletions(-)
 create mode 100644 net/sched/sch_psp.c

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 919af93..d3f8afd 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -430,6 +430,44 @@ enum {
 
 #define TCA_ATM_MAX(__TCA_ATM_MAX - 1)
 
+/* Precise Software Pacer section */
+
+/* presumably the maximum depth of the PSP class hierarchy -- confirm in sch_psp.c */
+#define TC_PSP_MAXDEPTH (8)
+
+/* presumably the type of the byte clock kept by sch_psp.c -- confirm there */
+typedef long long gapclock_t;
+
+/* Values of tc_psp_copt.mode ("target rate estimation method" in tc usage). */
+enum {
+   MODE_NORMAL = 0, /* mode 0: without pacing (per tc-psp.8) */
+   MODE_STATIC = 1, /* mode 1: static target rate (per tc-psp.8) */
+};
+
+/* Per-class options. */
+struct tc_psp_copt
+{
+   __u32   level;  /* presumably the class's level in the hierarchy -- confirm */
+   __u32   mode;   /* MODE_NORMAL or MODE_STATIC */
+   __u32   rate;   /* rate allocated to this class, bytes/sec */
+};
+
+/* Per-qdisc options. */
+struct tc_psp_qopt
+{
+   __u32   defcls; /* minor id of the class for unclassified packets */
+   __u32   rate;   /* physical interface bandwidth, bytes/sec */
+   __u32   direct_pkts; /* presumably counts packets sent via an HTB-style direct class -- confirm */
+};
+
+struct tc_psp_xstats
+{
+   __u32   bytes;  /* gap packet statistics */
+   __u32   packets;
+};
+
+/* Attribute types nested inside TCA_OPTIONS for PSP. */
+enum
+{
+   TCA_PSP_UNSPEC,
+   TCA_PSP_COPT,   /* presumably carries struct tc_psp_copt -- confirm */
+   TCA_PSP_QOPT,   /* presumably carries struct tc_psp_qopt -- confirm */
+};
+
 /* Network emulator */
 
 enum
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 9c15c48..ec40e43 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -184,6 +184,15 @@ config NET_SCH_DSMARK
  To compile this code as a module, choose M here: the
  module will be called sch_dsmark.
 
+config NET_SCH_PSP
+   tristate "Precise Software Pacer (PSP)"
+   ---help---
+ Say Y here if you want to include PSPacer module, which means
+ that you will be able to control precise pacing.
+
+ To compile this driver as a module, choose M here: the
+ module will be called sch_psp.
+
 config NET_SCH_NETEM
tristate "Network emulator (NETEM)"
---help---
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 81ecbe8..85425c2 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o
 obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o
 obj-$(CONFIG_NET_SCH_PRIO) += sch_prio.o
 obj-$(CONFIG_NET_SCH_ATM)  += sch_atm.o
+obj-$(CONFIG_NET_SCH_PSP)  += sch_psp.o
 obj-$(CONFIG_NET_SCH_NETEM)+= sch_netem.o
 obj-$(CONFIG_NET_CLS_U32)  += cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)   += cls_route.o
diff --git a/net/sched/sch_psp.c b/net/sched/sch_psp.c
new file mode 100644
index 000..5c56742
--- /dev/null
+++ b/net/sched/sch_psp.c
@@ -0,0 +1,959 @@
+/*
+ * net/sched/sch_psp.c PSPacer: Precise Software Pacer
+ *
+ * Copyright (C) 2004-2007 National Institute of Advanced 
+ * Industrial Science and Technology (AIST), Japan.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors:Ryousei Takano, <[EMAIL PROTECTED]>
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/* PSPacer achieves precise rate regulation results, and no microscopic
+ * burst transmission which exceeds the limit is generated.
+ *
+ * The basic idea is that transmission timing can be precisely controlled,
+ * if packets are sent back-to-back at the wire rate.  PSPacer controls 
+ * the packet transmission intervals by inserting additional packets, 
+ * called gap packets, between adjacent packets.  The transmission interval
+ * can be controlled accurately by adjusting the number and size of the gap
+ * packets. PSPacer uses the 802.3x PAUSE frame as the gap packet.
+ *
+ * For the purpose of adjusting the gap size, this Qdisc maintains a byte
+ * clock which records the total number of bytes transmitted per connection.
+ * Each sub-class has a class-local clock which is used to decide
+ * whether to send a packet or not.  If there are no packets to send,
+ * gap packets are inserted.
+ *
+ * References:
+ * [1] R.Takano, T.Kudoh, Y.Kodama, M.Matsuda, H.Tezuka, and Y.Ishikawa,
+ * "Design and Evaluation of Precise Software Pacing Mechanisms for
+ * Fast Long-Distance Networks", PFLDnet2005.
+ * [2] http://www.gridmpi.org/gridtcp.jsp
+ */
+
+#define HW_GAP (16)	/* Preamble(8) + Inter Frame Gap(8) */
+#define FCS (4)	/* Frame Check Sequence(4) */
+#define MIN_GAP (64)	/* Minimum size of gap packet */
+#define MIN_TARGET_RATE (1000)	/* 1 KB/s (= 8 Kbps) */
+
+#define PSP_HSIZE (16)	/* presumably the class hash table size -- confirm */
+
+#define BIT2BYTE(n) ((n) >> 3)	/* bit count to byte count (divide by 8) */
+
+struct psp_class
+{
+  

[PATCH net-2.6] [TCP]: fix D-SACK cwnd handling

2007-10-25 Thread Ryousei Takano
In the current net-2.6 kernel, the handling of FLAG_DSACKING_ACK is broken.
The flag variable is overwritten with 1 just after FLAG_DSACKING_ACK is set in it.

if (found_dup_sack)
flag |= FLAG_DSACKING_ACK;
:
flag = 1;

To fix it, this patch introduces a part of the tcp_sacktag_state patch:
http://marc.info/?l=linux-netdev&m=119210560431519&w=2

Do you plan to apply the tcp_sacktag_state patch?
Or please apply this patch.

Signed-off-by: David S. Miller <[EMAIL PROTECTED]>
Signed-off-by: Ilpo Järvinen <[EMAIL PROTECTED]>
Signed-off-by: Ryousei Takano <[EMAIL PROTECTED]>
---
 net/ipv4/tcp_input.c |   12 +---
 1 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3dbbb44..59e3c9a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1248,6 +1248,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff 
*ack_skb, u32 prior_snd_
int cached_fack_count;
int i;
int first_sack_index;
+   int force_one_sack;
 
if (!tp->sacked_out) {
if (WARN_ON(tp->fackets_out))
@@ -1272,18 +1273,18 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff 
*ack_skb, u32 prior_snd_
 * if the only SACK change is the increase of the end_seq of
 * the first block then only apply that SACK block
 * and use retrans queue hinting otherwise slowpath */
-   flag = 1;
+   force_one_sack = 1;
for (i = 0; i < num_sacks; i++) {
__be32 start_seq = sp[i].start_seq;
__be32 end_seq = sp[i].end_seq;
 
if (i == 0) {
if (tp->recv_sack_cache[i].start_seq != start_seq)
-   flag = 0;
+   force_one_sack = 0;
} else {
if ((tp->recv_sack_cache[i].start_seq != start_seq) ||
(tp->recv_sack_cache[i].end_seq != end_seq))
-   flag = 0;
+   force_one_sack = 0;
}
tp->recv_sack_cache[i].start_seq = start_seq;
tp->recv_sack_cache[i].end_seq = end_seq;
@@ -1295,7 +1296,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff 
*ack_skb, u32 prior_snd_
}
 
first_sack_index = 0;
-   if (flag)
+   if (force_one_sack)
num_sacks = 1;
else {
int j;
@@ -1321,9 +1322,6 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff 
*ack_skb, u32 prior_snd_
}
}
 
-   /* clear flag as used for different purpose in following code */
-   flag = 0;
-
/* Use SACK fastpath hint if valid */
cached_skb = tp->fastpath_skb_hint;
cached_fack_count = tp->fastpath_cnt_hint;
-- 
1.5.3.4

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html