Quoting Dan Smith ([email protected]):
> This patch adds basic support for checkpoint/restart (C/R) of open INET
> sockets.  I think that all the important bits of the TCP and ICSK socket
> structures are saved, but I think there is still some additional IPv6
> stuff that needs to be handled.
> 
> With this patch applied, the following script can be used to demonstrate
> the functionality:
> 
>   https://lists.linux-foundation.org/pipermail/containers/2009-October/021239.html
> 
> It shows that this enables migration of a sendmail process with open
> connections from one machine to another without dropping them.
> 
> We probably need comments from the netdev people about the quality of
> sanity checking we do on the values in the ckpt_hdr_socket_inet
> structure on restart.
> 
> Note that this doesn't address lingering sockets yet.
> 
> Changes in v5:
>  - Change ckpt_write_err() to ckpt_err()
> 
> Changes in v4:
>  - Use the new socket buffer restore functions introduced in the
>    previous patch
>  - Move listen_sockets list under the restart items in ckpt_ctx
>  - Rename RESTART_SOCK_LISTENONLY to RESTART_CONN_RESET
> 
> Changes in v3:
>  - Prevent restart from allowing a bind on a <1024 port unless the
>    user is granted that capability
>  - Add some sanity checking in the inet_precheck() function to make sure
>    the values read from the checkpoint image are within acceptable ranges
>  - Check the result of sock_restore_header_info() and fail if needed
> 
> Changes in v2:
>  - Restore saddr, rcv_saddr, daddr, sport, and dport from the sockaddr
>    structure instead of saving them separately
>  - Fix 'sock' naming in sock_cptrst()
>  - Don't take the queue lock before skb_queue_tail() since it is
>    done for us
>  - Allow "listen only" restore behavior if RESTART_SOCK_LISTENONLY
>    flag is specified on sys_restart()
>  - Pull the implementation of the list of listening sockets back into
>    this patch
>  - Fix dangling printk
>  - Add some comments around the parent/child restore logic
> 
> Cc: [email protected]
> Acked-by: Oren Laadan <[email protected]>
> Signed-off-by: Dan Smith <[email protected]>

Acked-by: Serge Hallyn <[email protected]>

> ---
>  checkpoint/sys.c                 |    4 +
>  include/linux/checkpoint.h       |    5 +-
>  include/linux/checkpoint_hdr.h   |   95 +++++++++
>  include/linux/checkpoint_types.h |    1 +
>  net/checkpoint.c                 |   27 ++--
>  net/ipv4/checkpoint.c            |  391 ++++++++++++++++++++++++++++++++++----
>  6 files changed, 473 insertions(+), 50 deletions(-)
> 
> diff --git a/checkpoint/sys.c b/checkpoint/sys.c
> index 9f9e825..baed891 100644
> --- a/checkpoint/sys.c
> +++ b/checkpoint/sys.c
> @@ -244,6 +244,8 @@ static void ckpt_ctx_free(struct ckpt_ctx *ctx)
> 
>       kfree(ctx->pids_arr);
> 
> +     sock_listening_list_free(&ctx->listen_sockets);
> +
>       kfree(ctx);
>  }
> 
> @@ -274,6 +276,8 @@ static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags,
> 
>       mutex_init(&ctx->msg_mutex);
> 
> +     INIT_LIST_HEAD(&ctx->listen_sockets);
> +
>       err = -EBADF;
>       ctx->file = fget(fd);
>       if (!ctx->file)
> diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
> index 0eff43e..ddc9aa0 100644
> --- a/include/linux/checkpoint.h
> +++ b/include/linux/checkpoint.h
> @@ -20,6 +20,7 @@
>  #define RESTART_FROZEN               0x2
>  #define RESTART_GHOST                0x4
>  #define RESTART_KEEP_LSM     0x8
> +#define RESTART_CONN_RESET      0x10
> 
>  /* misc user visible */
>  #define CHECKPOINT_FD_NONE   -1
> @@ -53,7 +54,8 @@
>       (RESTART_TASKSELF | \
>        RESTART_FROZEN | \
>        RESTART_KEEP_LSM | \
> -      RESTART_GHOST)
> +      RESTART_GHOST | \
> +      RESTART_CONN_RESET)
>  #define CKPT_LSM_INFO_LEN 200
>  #define CKPT_LSM_STRING_MAX 1024
> 
> @@ -105,6 +107,7 @@ extern int ckpt_sock_getnames(struct ckpt_ctx *ctx,
>                             struct sockaddr *loc, unsigned *loc_len,
>                             struct sockaddr *rem, unsigned *rem_len);
>  struct sk_buff *sock_restore_skb(struct ckpt_ctx *ctx);
> +void sock_listening_list_free(struct list_head *head);
> 
>  /* ckpt kflags */
>  #define ckpt_set_ctx_kflag(__ctx, __kflag)  \
> diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
> index 787cf89..d1a93e3 100644
> --- a/include/linux/checkpoint_hdr.h
> +++ b/include/linux/checkpoint_hdr.h
> @@ -15,6 +15,7 @@
>  #include <linux/socket.h>
>  #include <linux/un.h>
>  #include <linux/in.h>
> +#include <linux/in6.h>
>  #else
>  #include <sys/types.h>
>  #include <linux/types.h>
> @@ -625,6 +626,100 @@ struct ckpt_hdr_socket_unix {
> 
>  struct ckpt_hdr_socket_inet {
>       struct ckpt_hdr h;
> +     __u32 daddr;
> +     __u32 rcv_saddr;
> +     __u32 saddr;
> +     __u16 dport;
> +     __u16 num;
> +     __u16 sport;
> +     __s16 uc_ttl;
> +     __u16 cmsg_flags;
> +
> +     struct {
> +             __u64 timeout;
> +             __u32 ato;
> +             __u32 lrcvtime;
> +             __u16 last_seg_size;
> +             __u16 rcv_mss;
> +             __u8 pending;
> +             __u8 quick;
> +             __u8 pingpong;
> +             __u8 blocked;
> +     } icsk_ack __attribute__ ((aligned(8)));
> +
> +     /* FIXME: Skipped opt, tos, multicast, cork settings */
> +
> +     struct {
> +             __u32 rcv_nxt;
> +             __u32 copied_seq;
> +             __u32 rcv_wup;
> +             __u32 snd_nxt;
> +             __u32 snd_una;
> +             __u32 snd_sml;
> +             __u32 rcv_tstamp;
> +             __u32 lsndtime;
> +
> +             __u32 snd_wl1;
> +             __u32 snd_wnd;
> +             __u32 max_window;
> +             __u32 mss_cache;
> +             __u32 window_clamp;
> +             __u32 rcv_ssthresh;
> +             __u32 frto_highmark;
> +
> +             __u32 srtt;
> +             __u32 mdev;
> +             __u32 mdev_max;
> +             __u32 rttvar;
> +             __u32 rtt_seq;
> +
> +             __u32 packets_out;
> +             __u32 retrans_out;
> +
> +             __u32 snd_up;
> +             __u32 rcv_wnd;
> +             __u32 write_seq;
> +             __u32 pushed_seq;
> +             __u32 lost_out;
> +             __u32 sacked_out;
> +             __u32 fackets_out;
> +             __u32 tso_deferred;
> +             __u32 bytes_acked;
> +
> +             __s32 lost_cnt_hint;
> +             __u32 retransmit_high;
> +
> +             __u32 lost_retrans_low;
> +
> +             __u32 prior_ssthresh;
> +             __u32 high_seq;
> +
> +             __u32 retrans_stamp;
> +             __u32 undo_marker;
> +             __s32 undo_retrans;
> +             __u32 total_retrans;
> +
> +             __u32 urg_seq;
> +             __u32 keepalive_time;
> +             __u32 keepalive_intvl;
> +
> +             __u16 urg_data;
> +             __u16 advmss;
> +             __u8 frto_counter;
> +             __u8 nonagle;
> +
> +             __u8 ecn_flags;
> +             __u8 reordering;
> +
> +             __u8 keepalive_probes;
> +     } tcp __attribute__ ((aligned(8)));
> +
> +     struct {
> +             struct in6_addr saddr;
> +             struct in6_addr rcv_saddr;
> +             struct in6_addr daddr;
> +     } inet6 __attribute__ ((aligned(8)));
> +
>       __u32 laddr_len;
>       __u32 raddr_len;
>       struct sockaddr_in laddr;
> diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
> index 77f8592..79c9c09 100644
> --- a/include/linux/checkpoint_types.h
> +++ b/include/linux/checkpoint_types.h
> @@ -82,6 +82,7 @@ struct ckpt_ctx {
>       wait_queue_head_t waitq;        /* waitqueue for restarting tasks */
>       wait_queue_head_t ghostq;       /* waitqueue for ghost tasks */
>       struct cred *realcred, *ecred;  /* tmp storage for cred at restart */
> +     struct list_head listen_sockets;/* listening parent sockets */
> 
>       struct ckpt_stats stats;        /* statistics */
> 
> diff --git a/net/checkpoint.c b/net/checkpoint.c
> index 49d9a2f..aba1497 100644
> --- a/net/checkpoint.c
> +++ b/net/checkpoint.c
> @@ -324,6 +324,7 @@ static int __sock_write_skb(struct ckpt_ctx *ctx,
> 
>  static int __sock_write_buffers(struct ckpt_ctx *ctx,
>                               struct sk_buff_head *queue,
> +                             uint16_t family,
>                               int dst_objref)
>  {
>       struct sk_buff *skb;
> @@ -336,11 +337,11 @@ static int __sock_write_buffers(struct ckpt_ctx *ctx,
>                       return -EBUSY;
>               }
> 
> -             /* The other ancillary messages are always present
> -              * unlike descriptors.  Even though we can't detect
> -              * them and fail the checkpoint, we're not at risk
> -              * because we don't save out (or restore) the control
> -              * information contained in the skb.
> +             /* The other ancillary messages UNIX are always
> +              * present unlike descriptors.  Even though we can't
> +              * detect them and fail the checkpoint, we're not at
> +              * risk because we don't restore the control
> +              * information in the UNIX code.
>                */
> 
>               ret = __sock_write_skb(ctx, skb, dst_objref);
> @@ -353,6 +354,7 @@ static int __sock_write_buffers(struct ckpt_ctx *ctx,
> 
>  static int sock_write_buffers(struct ckpt_ctx *ctx,
>                             struct sk_buff_head *queue,
> +                           uint16_t family,
>                             int dst_objref)
>  {
>       struct ckpt_hdr_socket_queue *h;
> @@ -372,7 +374,7 @@ static int sock_write_buffers(struct ckpt_ctx *ctx,
>       h->skb_count = ret;
>       ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
>       if (!ret)
> -             ret = __sock_write_buffers(ctx, &tmpq, dst_objref);
> +             ret = __sock_write_buffers(ctx, &tmpq, family, dst_objref);
> 
>   out:
>       ckpt_hdr_put(ctx, h);
> @@ -394,12 +396,14 @@ int sock_deferred_write_buffers(void *data)
>               return dst_objref;
>       }
> 
> -     ret = sock_write_buffers(ctx, &dq->sk->sk_receive_queue, dst_objref);
> +     ret = sock_write_buffers(ctx, &dq->sk->sk_receive_queue,
> +                              dq->sk->sk_family, dst_objref);
>       ckpt_debug("write recv buffers: %i\n", ret);
>       if (ret < 0)
>               return ret;
> 
> -     ret = sock_write_buffers(ctx, &dq->sk->sk_write_queue, dst_objref);
> +     ret = sock_write_buffers(ctx, &dq->sk->sk_write_queue,
> +                              dq->sk->sk_family, dst_objref);
>       ckpt_debug("write send buffers: %i\n", ret);
> 
>       return ret;
> @@ -924,10 +928,9 @@ struct sock *do_sock_restore(struct ckpt_ctx *ctx)
>               goto err;
> 
>       if ((h->sock_common.family == AF_INET) &&
> -         (h->sock.state != TCP_LISTEN)) {
> -             /* Temporary hack to enable restore of TCP_LISTEN sockets
> -              * while forcing anything else to a closed state
> -              */
> +         (h->sock.state != TCP_LISTEN) &&
> +         (ctx->uflags & RESTART_CONN_RESET)) {
> +             ckpt_debug("Forcing open socket closed\n");
>               sock->sk->sk_state = TCP_CLOSE;
>               sock->state = SS_UNCONNECTED;
>       }
> diff --git a/net/ipv4/checkpoint.c b/net/ipv4/checkpoint.c
> index 9cbbf5e..3e20cc9 100644
> --- a/net/ipv4/checkpoint.c
> +++ b/net/ipv4/checkpoint.c
> @@ -17,6 +17,7 @@
>  #include <linux/deferqueue.h>
>  #include <net/tcp_states.h>
>  #include <net/tcp.h>
> +#include <net/ipv6.h>
> 
>  struct dq_sock {
>       struct ckpt_ctx *ctx;
> @@ -28,6 +29,236 @@ struct dq_buffers {
>       struct sock *sk;
>  };
> 
> +struct listen_item {
> +     struct sock *sk;
> +     struct list_head list;
> +};
> +
> +void sock_listening_list_free(struct list_head *head)
> +{
> +     struct listen_item *item, *tmp;
> +
> +     list_for_each_entry_safe(item, tmp, head, list) {
> +             list_del(&item->list);
> +             kfree(item);
> +     }
> +}
> +
> +static int sock_listening_list_add(struct ckpt_ctx *ctx, struct sock *sk)
> +{
> +     struct listen_item *item;
> +
> +     item = kmalloc(sizeof(*item), GFP_KERNEL);
> +     if (!item)
> +             return -ENOMEM;
> +
> +     item->sk = sk;
> +     list_add(&item->list, &ctx->listen_sockets);
> +
> +     return 0;
> +}
> +
> +static struct sock *sock_get_parent(struct ckpt_ctx *ctx, struct sock *sk)
> +{
> +     struct listen_item *item;
> +
> +     list_for_each_entry(item, &ctx->listen_sockets, list) {
> +             if (inet_sk(sk)->sport == inet_sk(item->sk)->sport)
> +                     return item->sk;
> +     }
> +
> +     return NULL;
> +}
> +
> +static int sock_hash_parent(void *data)
> +{
> +     struct dq_sock *dq = (struct dq_sock *)data;
> +     struct sock *parent;
> +
> +     ckpt_debug("INET post-restart hash\n");
> +
> +     dq->sk->sk_prot->hash(dq->sk);
> +
> +     /* If there is a listening socket with the same source port,
> +      * then become a child of that socket [we are the result of an
> +      * accept()].  Otherwise hash ourselves directly in [we are
> +      * the result of a connect()]
> +      */
> +
> +     parent = sock_get_parent(dq->ctx, dq->sk);
> +     if (parent) {
> +             inet_sk(dq->sk)->num = ntohs(inet_sk(dq->sk)->sport);
> +             local_bh_disable();
> +             __inet_inherit_port(parent, dq->sk);
> +             local_bh_enable();
> +     } else {
> +             inet_sk(dq->sk)->num = 0;
> +             inet_hash_connect(&tcp_death_row, dq->sk);
> +             inet_sk(dq->sk)->num = ntohs(inet_sk(dq->sk)->sport);
> +     }
> +
> +     return 0;
> +}
> +
> +static int sock_defer_hash(struct ckpt_ctx *ctx, struct sock *sock)
> +{
> +     struct dq_sock dq;
> +
> +     dq.sk = sock;
> +     dq.ctx = ctx;
> +
> +     return deferqueue_add(ctx->deferqueue, &dq, sizeof(dq),
> +                           sock_hash_parent, NULL);
> +}
> +
> +static int sock_inet_tcp_cptrst(struct ckpt_ctx *ctx,
> +                             struct tcp_sock *sk,
> +                             struct ckpt_hdr_socket_inet *hh,
> +                             int op)
> +{
> +     CKPT_COPY(op, hh->tcp.rcv_nxt, sk->rcv_nxt);
> +     CKPT_COPY(op, hh->tcp.copied_seq, sk->copied_seq);
> +     CKPT_COPY(op, hh->tcp.rcv_wup, sk->rcv_wup);
> +     CKPT_COPY(op, hh->tcp.snd_nxt, sk->snd_nxt);
> +     CKPT_COPY(op, hh->tcp.snd_una, sk->snd_una);
> +     CKPT_COPY(op, hh->tcp.snd_sml, sk->snd_sml);
> +     CKPT_COPY(op, hh->tcp.rcv_tstamp, sk->rcv_tstamp);
> +     CKPT_COPY(op, hh->tcp.lsndtime, sk->lsndtime);
> +
> +     CKPT_COPY(op, hh->tcp.snd_wl1, sk->snd_wl1);
> +     CKPT_COPY(op, hh->tcp.snd_wnd, sk->snd_wnd);
> +     CKPT_COPY(op, hh->tcp.max_window, sk->max_window);
> +     CKPT_COPY(op, hh->tcp.mss_cache, sk->mss_cache);
> +     CKPT_COPY(op, hh->tcp.window_clamp, sk->window_clamp);
> +     CKPT_COPY(op, hh->tcp.rcv_ssthresh, sk->rcv_ssthresh);
> +     CKPT_COPY(op, hh->tcp.frto_highmark, sk->frto_highmark);
> +     CKPT_COPY(op, hh->tcp.advmss, sk->advmss);
> +     CKPT_COPY(op, hh->tcp.frto_counter, sk->frto_counter);
> +     CKPT_COPY(op, hh->tcp.nonagle, sk->nonagle);
> +
> +     CKPT_COPY(op, hh->tcp.srtt, sk->srtt);
> +     CKPT_COPY(op, hh->tcp.mdev, sk->mdev);
> +     CKPT_COPY(op, hh->tcp.mdev_max, sk->mdev_max);
> +     CKPT_COPY(op, hh->tcp.rttvar, sk->rttvar);
> +     CKPT_COPY(op, hh->tcp.rtt_seq, sk->rtt_seq);
> +
> +     CKPT_COPY(op, hh->tcp.packets_out, sk->packets_out);
> +     CKPT_COPY(op, hh->tcp.retrans_out, sk->retrans_out);
> +
> +     CKPT_COPY(op, hh->tcp.urg_data, sk->urg_data);
> +     CKPT_COPY(op, hh->tcp.ecn_flags, sk->ecn_flags);
> +     CKPT_COPY(op, hh->tcp.reordering, sk->reordering);
> +     CKPT_COPY(op, hh->tcp.snd_up, sk->snd_up);
> +
> +     CKPT_COPY(op, hh->tcp.keepalive_probes, sk->keepalive_probes);
> +
> +     CKPT_COPY(op, hh->tcp.rcv_wnd, sk->rcv_wnd);
> +     CKPT_COPY(op, hh->tcp.write_seq, sk->write_seq);
> +     CKPT_COPY(op, hh->tcp.pushed_seq, sk->pushed_seq);
> +     CKPT_COPY(op, hh->tcp.lost_out, sk->lost_out);
> +     CKPT_COPY(op, hh->tcp.sacked_out, sk->sacked_out);
> +     CKPT_COPY(op, hh->tcp.fackets_out, sk->fackets_out);
> +     CKPT_COPY(op, hh->tcp.tso_deferred, sk->tso_deferred);
> +     CKPT_COPY(op, hh->tcp.bytes_acked, sk->bytes_acked);
> +
> +     CKPT_COPY(op, hh->tcp.lost_cnt_hint, sk->lost_cnt_hint);
> +     CKPT_COPY(op, hh->tcp.retransmit_high, sk->retransmit_high);
> +
> +     CKPT_COPY(op, hh->tcp.lost_retrans_low, sk->lost_retrans_low);
> +
> +     CKPT_COPY(op, hh->tcp.prior_ssthresh, sk->prior_ssthresh);
> +     CKPT_COPY(op, hh->tcp.high_seq, sk->high_seq);
> +
> +     CKPT_COPY(op, hh->tcp.retrans_stamp, sk->retrans_stamp);
> +     CKPT_COPY(op, hh->tcp.undo_marker, sk->undo_marker);
> +     CKPT_COPY(op, hh->tcp.undo_retrans, sk->undo_retrans);
> +     CKPT_COPY(op, hh->tcp.total_retrans, sk->total_retrans);
> +
> +     CKPT_COPY(op, hh->tcp.urg_seq, sk->urg_seq);
> +     CKPT_COPY(op, hh->tcp.keepalive_time, sk->keepalive_time);
> +     CKPT_COPY(op, hh->tcp.keepalive_intvl, sk->keepalive_intvl);
> +
> +     if (!skb_queue_empty(&sk->ucopy.prequeue))
> +             printk("PREQUEUE!\n");
> +
> +     return 0;
> +}
> +
> +static int sock_inet_restore_addrs(struct inet_sock *inet,
> +                                struct ckpt_hdr_socket_inet *hh)
> +{
> +     inet->daddr = hh->raddr.sin_addr.s_addr;
> +     inet->saddr = hh->laddr.sin_addr.s_addr;
> +     inet->rcv_saddr = inet->saddr;
> +
> +     inet->dport = hh->raddr.sin_port;
> +     inet->sport = hh->laddr.sin_port;
> +
> +     return 0;
> +}
> +
> +static int sock_inet_cptrst(struct ckpt_ctx *ctx,
> +                         struct sock *sk,
> +                         struct ckpt_hdr_socket_inet *hh,
> +                         int op)
> +{
> +     struct inet_sock *inet = inet_sk(sk);
> +     struct inet_connection_sock *icsk = inet_csk(sk);
> +     int ret;
> +
> +     if (op == CKPT_CPT) {
> +             CKPT_COPY(op, hh->daddr, inet->daddr);
> +             CKPT_COPY(op, hh->rcv_saddr, inet->rcv_saddr);
> +             CKPT_COPY(op, hh->dport, inet->dport);
> +             CKPT_COPY(op, hh->saddr, inet->saddr);
> +             CKPT_COPY(op, hh->sport, inet->sport);
> +     } else {
> +             ret = sock_inet_restore_addrs(inet, hh);
> +             if (ret)
> +                     return ret;
> +     }
> +
> +     CKPT_COPY(op, hh->num, inet->num);
> +     CKPT_COPY(op, hh->uc_ttl, inet->uc_ttl);
> +     CKPT_COPY(op, hh->cmsg_flags, inet->cmsg_flags);
> +
> +     CKPT_COPY(op, hh->icsk_ack.pending, icsk->icsk_ack.pending);
> +     CKPT_COPY(op, hh->icsk_ack.quick, icsk->icsk_ack.quick);
> +     CKPT_COPY(op, hh->icsk_ack.pingpong, icsk->icsk_ack.pingpong);
> +     CKPT_COPY(op, hh->icsk_ack.blocked, icsk->icsk_ack.blocked);
> +     CKPT_COPY(op, hh->icsk_ack.ato, icsk->icsk_ack.ato);
> +     CKPT_COPY(op, hh->icsk_ack.timeout, icsk->icsk_ack.timeout);
> +     CKPT_COPY(op, hh->icsk_ack.lrcvtime, icsk->icsk_ack.lrcvtime);
> +     CKPT_COPY(op,
> +               hh->icsk_ack.last_seg_size, icsk->icsk_ack.last_seg_size);
> +     CKPT_COPY(op, hh->icsk_ack.rcv_mss, icsk->icsk_ack.rcv_mss);
> +
> +     if (sk->sk_protocol == IPPROTO_TCP)
> +             ret = sock_inet_tcp_cptrst(ctx, tcp_sk(sk), hh, op);
> +     else if (sk->sk_protocol == IPPROTO_UDP)
> +             ret = 0;
> +     else {
> +             ret = -EINVAL;
> +             ckpt_err(ctx, ret, "unknown socket protocol %d",
> +                      sk->sk_protocol);
> +     }
> +
> +     if (sk->sk_family == AF_INET6) {
> +             struct ipv6_pinfo *inet6 = inet6_sk(sk);
> +             if (op == CKPT_CPT) {
> +                     ipv6_addr_copy(&hh->inet6.saddr, &inet6->saddr);
> +                     ipv6_addr_copy(&hh->inet6.rcv_saddr, &inet6->rcv_saddr);
> +                     ipv6_addr_copy(&hh->inet6.daddr, &inet6->daddr);
> +             } else {
> +                     ipv6_addr_copy(&inet6->saddr, &hh->inet6.saddr);
> +                     ipv6_addr_copy(&inet6->rcv_saddr, &hh->inet6.rcv_saddr);
> +                     ipv6_addr_copy(&inet6->daddr, &hh->inet6.daddr);
> +             }
> +     }
> +
> +     return ret;
> +}
> +
>  int inet_checkpoint(struct ckpt_ctx *ctx, struct socket *sock)
>  {
>       struct ckpt_hdr_socket_inet *in;
> @@ -43,6 +274,10 @@ int inet_checkpoint(struct ckpt_ctx *ctx, struct socket *sock)
>       if (ret)
>               goto out;
> 
> +     ret = sock_inet_cptrst(ctx, sock->sk, in, CKPT_CPT);
> +     if (ret < 0)
> +             goto out;
> +
>       ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) in);
>   out:
>       ckpt_hdr_put(ctx, in);
> @@ -55,51 +290,22 @@ int inet_collect(struct ckpt_ctx *ctx, struct socket *sock)
>       return ckpt_obj_collect(ctx, sock->sk, CKPT_OBJ_SOCK);
>  }
> 
> -static int inet_read_buffer(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
> +static int inet_read_buffer(struct ckpt_ctx *ctx,
> +                         struct sk_buff_head *queue)
>  {
> -     struct ckpt_hdr_socket_buffer *h;
> -     int len;
> -     int ret;
>       struct sk_buff *skb = NULL;
> 
> -     h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFER);
> -     if (IS_ERR(h))
> -             return PTR_ERR(h);
> -
> -     len = _ckpt_read_obj_type(ctx, NULL, 0, CKPT_HDR_BUFFER);
> -     if (len < 0) {
> -             ret = len;
> -             goto out;
> -     } else if (len > SKB_MAX_ALLOC) {
> -             ckpt_debug("Socket buffer too big (%i > %lu)",
> -                        len, SKB_MAX_ALLOC);
> -             ret = -ENOSPC;
> -             goto out;
> -     }
> -
> -     skb = alloc_skb(len, GFP_KERNEL);
> -     if (!skb) {
> -             ret = -ENOMEM;
> -             goto out;
> -     }
> -
> -     ret = ckpt_kread(ctx, skb_put(skb, len), len);
> -     if (ret < 0)
> -             goto out;
> +     skb = sock_restore_skb(ctx);
> +     if (IS_ERR(skb))
> +             return PTR_ERR(skb);
> 
> -     spin_lock(&queue->lock);
>       skb_queue_tail(queue, skb);
> -     spin_unlock(&queue->lock);
> - out:
> -     ckpt_hdr_put(ctx, h);
> -
> -     if ((ret < 0) && skb)
> -             kfree_skb(skb);
> 
> -     return ret;
> +     return skb->len;
>  }
> 
> -static int inet_read_buffers(struct ckpt_ctx *ctx, struct sk_buff_head 
> *queue)
> +static int inet_read_buffers(struct ckpt_ctx *ctx,
> +                          struct sk_buff_head *queue)
>  {
>       struct ckpt_hdr_socket_queue *h;
>       int ret = 0;
> @@ -162,6 +368,19 @@ static int inet_defer_restore_buffers(struct ckpt_ctx *ctx, struct sock *sk)
> 
>  static int inet_precheck(struct socket *sock, struct ckpt_hdr_socket_inet 
> *in)
>  {
> +     __u8 icsk_ack_mask = ICSK_ACK_SCHED | ICSK_ACK_TIMER |
> +             ICSK_ACK_PUSHED | ICSK_ACK_PUSHED2;
> +     __u16 urg_mask = TCP_URG_VALID | TCP_URG_NOTYET | TCP_URG_READ;
> +     __u8 nonagle_mask = TCP_NAGLE_OFF | TCP_NAGLE_CORK | TCP_NAGLE_PUSH;
> +     __u8 ecn_mask = TCP_ECN_OK | TCP_ECN_QUEUE_CWR | TCP_ECN_DEMAND_CWR;
> +
> +     if ((htons(in->laddr.sin_port) < PROT_SOCK) &&
> +         !capable(CAP_NET_BIND_SERVICE)) {
> +             ckpt_debug("unable to bind to port %hu\n",
> +                        htons(in->laddr.sin_port));
> +             return -EINVAL;
> +     }
> +
>       if (in->laddr_len > sizeof(struct sockaddr_in)) {
>               ckpt_debug("laddr_len is too big\n");
>               return -EINVAL;
> @@ -172,6 +391,77 @@ static int inet_precheck(struct socket *sock, struct ckpt_hdr_socket_inet *in)
>               return -EINVAL;
>       }
> 
> +     /* Set ato to the default */
> +     in->icsk_ack.ato = TCP_ATO_MIN;
> +
> +     /* No quick acks are scheduled after a restart */
> +     in->icsk_ack.quick = 0;
> +
> +     if (in->icsk_ack.pending & ~icsk_ack_mask) {
> +             ckpt_debug("invalid pending flags 0x%x\n",
> +                        in->icsk_ack.pending & ~icsk_ack_mask);
> +             return -EINVAL;
> +     }
> +
> +     if (in->icsk_ack.pingpong > 1) {
> +             ckpt_debug("invalid icsk_ack.pingpong value\n");
> +             return -EINVAL;
> +     }
> +
> +     if (in->icsk_ack.blocked > 1) {
> +             ckpt_debug("invalid icsk_ack.blocked value\n");
> +             return -EINVAL;
> +     }
> +
> +     /* do_tcp_setsockopt() quietly makes this coercion */
> +     if (in->tcp.window_clamp < (SOCK_MIN_RCVBUF / 2))
> +             in->tcp.window_clamp = SOCK_MIN_RCVBUF / 2;
> +     else if (in->tcp.window_clamp > 65535U) {
> +             ckpt_debug("invalid window_clamp value\n");
> +             return -EINVAL;
> +     }
> +
> +     if (in->tcp.rcv_ssthresh > (4U * in->tcp.advmss))
> +             in->tcp.rcv_ssthresh = 4U * in->tcp.advmss;
> +
> +     /* These will all be recalculated on the next call to
> +      * tcp_rtt_estimator()
> +      */
> +     in->tcp.srtt = in->tcp.mdev = in->tcp.mdev_max = 0;
> +     in->tcp.rttvar = in->tcp.rtt_seq = 0;
> +
> +     /* Might want to set packets_out to zero ? */
> +
> +     if (in->tcp.rcv_wnd > MAX_TCP_WINDOW)
> +             in->tcp.rcv_wnd = MAX_TCP_WINDOW;
> +
> +     if (in->tcp.keepalive_intvl > MAX_TCP_KEEPINTVL) {
> +             ckpt_debug("keepalive_intvl %i out of range\n",
> +                        in->tcp.keepalive_intvl);
> +             return -EINVAL;
> +     }
> +
> +     if (in->tcp.keepalive_probes > MAX_TCP_KEEPCNT) {
> +             ckpt_debug("Invalid keepalive_probes value %i\n",
> +                        in->tcp.keepalive_probes);
> +             return -EINVAL;
> +     }
> +
> +     if (in->tcp.urg_data & ~urg_mask) {
> +             ckpt_debug("Invalid urg_data value\n");
> +             return -EINVAL;
> +     }
> +
> +     if (in->tcp.nonagle & ~nonagle_mask) {
> +             ckpt_debug("Invalid nonagle value\n");
> +             return -EINVAL;
> +     }
> +
> +     if (in->tcp.ecn_flags & ~ecn_mask) {
> +             ckpt_debug("Invalid ecn_flags value\n");
> +             return -EINVAL;
> +     }
> +
>       return 0;
>  }
> 
> @@ -209,8 +499,35 @@ int inet_restore(struct ckpt_ctx *ctx,
>                       ckpt_debug("inet listen: %i\n", ret);
>                       if (ret < 0)
>                               goto out;
> +
> +                     /* We are a listening socket, so add ourselves
> +                      * to the list of parent sockets.  This will
> +                      * allow our children to find us later and
> +                      * link up
> +                      */
> +
> +                     ret = sock_listening_list_add(ctx, sock->sk);
> +                     if (ret < 0)
> +                             goto out;
>               }
>       } else {
> +             ret = sock_inet_cptrst(ctx, sock->sk, in, CKPT_RST);
> +             if (ret)
> +                     goto out;
> +
> +             if ((h->sock.state == TCP_ESTABLISHED) &&
> +                 (h->sock.protocol == IPPROTO_TCP)) {
> +                     /* A connected socket that was spawned from an
> +                      * accept() needs to be hashed with its parent
> +                      * listening socket in order to receive
> +                      * traffic on the original port.  Since we may
> +                      * not have restarted the parent yet, we defer
> +                      * this until later when we know we have all
> +                      * the listening sockets accounted for.
> +                      */
> +                     ret = sock_defer_hash(ctx, sock->sk);
> +             }
> +
>               if (!sock_flag(sock->sk, SOCK_DEAD))
>                       ret = inet_defer_restore_buffers(ctx, sock->sk);
>       }
> -- 
> 1.6.2.5
> 
> _______________________________________________
> Containers mailing list
> [email protected]
> https://lists.linux-foundation.org/mailman/listinfo/containers
_______________________________________________
Containers mailing list
[email protected]
https://lists.linux-foundation.org/mailman/listinfo/containers

_______________________________________________
Devel mailing list
[email protected]
https://openvz.org/mailman/listinfo/devel

Reply via email to