On Thu, Jul 02, 2009 at 03:23:04PM +0200, Lars Ellenberg wrote:
> On Thu, Jul 02, 2009 at 11:30:25AM +0300, Amir Vadai wrote:
> > Please attach the perl script to reproduce and I will check it.
> > As to the second problem - I did notice such behavior but couldn't
> > find a scenario to reproduce it. I guess it happen when the socket is
> > closed due to error.
> > 
> > Tell me if you notice any message in dmesg.
> 
> My former test cluster has been reassigned.
> The new test hardware will be available earliest tomorrow.
> But I don't think there was anything relevant in dmesg.
> 
> I'll try to reproduce on the new test hardware then,
> and will get back to you as soon as possible.

ok.
finally new test hardware working.

this is on debian lenny,
userland ofed from http://pkg-ofed.alioth.debian.org/apt/ofed/

kernel git://git.openfabrics.org/ofed_1_4/linux-2.6.git
merged with upstream stable
git://git4.kernel.org/pub/scm/linux/kernel/git/stable/linux-2.6.27.y.git

both as of today:
  ofed_kernel            08acda8 sdp: Fix memory leak in bzcopy
  linux-v2.6.27.y/master 49cbf40 Linux 2.6.27.26


kernel config: very many "kernel debugging" things enabled.
if you want me to try a certain .config, or anything,
this can be arranged.

two very ugly perl scripts attached,
one tcp server,
one tcp client,
adapted from the perlipc man page.

client connects,
sends a package,
receives a package
in an endless loop.

package format:
4 byte magic, 2 byte ignored, 2 byte payload length
indicated length of payload, all same bytes,
but the trailing 4 byte, which again is a magic number.

with ethernet, or IPoIB: runs endless.

with LD_PRELOAD=libsdp.so runs for very few iterations,
and errors out on one of the sanity checks.


sample output:
r...@kugel:/home/lars/DRBD/IB_SDP# LD_PRELOAD=libsdp.so perl my_client.pl 
rum-ib0
2009-07-07 19:07:08 my_client.pl 4300: recv hdr [25]: invalid magic:  32 30 30 
39 2d 30 37 2d

which hapeens to be the hexdump of the string "2009-07-".
where did it copy_user() that from? wtf?

r...@kugel:/home/lars/DRBD/IB_SDP# LD_PRELOAD=libsdp.so perl my_client.pl 
rum-ib0
2009-07-07 19:07:08 my_client.pl 4301: recv payload [35]: expected 12296, but 
received 12295 byte; last bytes received:  55 e4 e3 e2
                                 ^^^^ pid,           ^^ seq number.
so for only 35 ping/pongs it did work ok.

exactly: one byte too short.
the trailing magic expected is e4 e3 e1 e1

r...@kugel:/home/lars/DRBD/IB_SDP# LD_PRELOAD=libsdp.so perl my_client.pl 
rum-ib0
2009-07-07 19:07:09 my_client.pl 4302: recv payload [33]: expected 4131, but 
received 4130 byte; last bytes received:  55 e4 e3 e2

r...@kugel:/home/lars/DRBD/IB_SDP# LD_PRELOAD=libsdp.so perl my_client.pl 
rum-ib0
2009-07-07 19:07:10 my_client.pl 4303: recv payload [29]: expected 16401, but 
received 16400 byte; last bytes received:  55 e4 e3 e2

r...@kugel:/home/lars/DRBD/IB_SDP# LD_PRELOAD=libsdp.so perl my_client.pl 
rum-ib0
2009-07-07 19:07:12 my_client.pl 4304: recv payload [21]: expected 4110, but 
received 4109 byte; last bytes received:  55 e4 e3 e2

r...@kugel:/home/lars/DRBD/IB_SDP# LD_PRELOAD=libsdp.so perl my_client.pl 
rum-ib0
2009-07-07 19:07:13 my_client.pl 4305: recv payload [4]: expected 20495, but 
received 20494 byte; last bytes received:  55 e4 e3 e2

r...@kugel:/home/lars/DRBD/IB_SDP# LD_PRELOAD=libsdp.so perl my_client.pl 
rum-ib0
2009-07-07 19:07:14 my_client.pl 4306: recv payload [12]: expected 22530, but 
received 22529 byte; last bytes received:  55 e4 e3 e2


any suggestions how to proceed from here?

--
: Lars Ellenberg
: LINBIT | Your Way to High Availability
: DRBD/HA support and consulting http://www.linbit.com

DRBD® and LINBIT® are registered trademarks of LINBIT, Austria.
#!/usr/bin/perl -w

# adapted from perlipc(1)

use strict;
use Socket;
use POSIX 'strftime';
my ($remote,$port, $iaddr, $paddr, $proto, $line);
my $seq_nr = 0;

$remote  = shift || 'localhost';
$port    = shift || 2345;  # random port
if ($port =~ /\D/) { $port = getservbyname($port, 'tcp') }
die "No port" unless $port;
$iaddr   = inet_aton($remote)               || die "no host: $remote";
$paddr   = sockaddr_in($port, $iaddr);

$proto   = getprotobyname('tcp');
socket(SOCK, PF_INET, SOCK_STREAM, $proto)  || die "socket: $!";
connect(SOCK, $paddr)    || die "connect: $!";

exit 1 if ping_pong_with_guard_bytes();

close (SOCK)            || die "close: $!";
exit;

sub logmsg { print strftime("%F %T", localtime), " $0 $$: @_\n" }

sub random_packet()
{
	# TODO: add checksum
	my $p_len = int(rand(33000));
	pack("Nnn", 0xb1b2b3b4, 0x3333, $p_len + 4) .
	("\x55" x $p_len) . pack("N", 0xe4e3e2e1);
}

sub receive_and_validate_packet {
	my ($hdr, $payload, $magic, $cmd, $len);
	$hdr = "\xff\xee\xdd\xcc" x 1024; # stupid; just to see where the corruption is from.
	if (!defined(recv SOCK, $hdr, 8, MSG_WAITALL)) {
		logmsg "recv hdr error [$seq_nr]: $!";
		return 1;
	}
	if (length($hdr) != 8) {
		logmsg "recv hdr [$seq_nr]: too short: ",
			join " ", map { sprintf "%02x", $_ } unpack "C*", $hdr;
		return 1;
	}
	($magic, $cmd, $len) = unpack "Nnn", $hdr;
	if ($magic != 0xb1b2b3b4) {
		logmsg "recv hdr [$seq_nr]: invalid magic: ",
			join " ", map { sprintf "%02x", $_ } unpack "C*", $hdr;
		return 1;
	}
	if (!defined(recv SOCK, $payload, $len, MSG_WAITALL)) {
		logmsg "recv payload error [$seq_nr]: $!";
		return 1;
	}
	my $plen = length($payload);
	my $p_last_4 = substr($payload, -4, 4);
	my $p_trail_magic = unpack "N", $p_last_4;
	if ($plen != $len) {
		logmsg "recv payload [$seq_nr]: expected $len, but received $plen byte; last bytes received: ",
			join " ", map { sprintf "%02x", $_ } unpack "C*", $p_last_4;
		return 1;
	}
	if ($p_trail_magic != 0xe4e3e2e1) {
		logmsg "recv payload [$seq_nr]: invalid trailing magic; last bytes received: ",
			join " ", map { sprintf "%02x", $_ } unpack "C*", $p_last_4;
		return 1;
	}
	++$seq_nr;
	return 0;
}

sub send_random_packet() {
	my $p = random_packet();
	my $p_len = length($p);
	# this is supposedly a blocking send,
	# so it should send the whole packet!
	my $s_len = send SOCK, $p, 0;
	if (!defined($s_len)) {
		logmsg "send error: $!";
		return 1;
	}
	if ($s_len != $p_len) {
		logmsg "send error: wanted to send $p_len, but could only send $s_len bytes";
		return 1;
	}
}

sub ping_pong_with_guard_bytes {
	my $ret;
	while (1) {
		$ret = send_random_packet();
		return $ret if $ret;
		$ret = receive_and_validate_packet();
		return $ret if $ret;
	}
}
#!/usr/bin/perl -Tw

# adapted from perlipc(1)

use strict;
use POSIX 'strftime';
use Socket;
use Carp;

sub spawn;  # forward declaration
sub logmsg { print strftime("%F %T", localtime), " $0 $$: @_\n" }

my $port = shift || 2345;
my $proto = getprotobyname('tcp');

($port) = $port =~ /^(\d+)$/                        or die "invalid port";

socket(Server, PF_INET, SOCK_STREAM, $proto)        || die "socket: $!";
setsockopt(Server, SOL_SOCKET, SO_REUSEADDR,
			       pack("l", 1))   || die "setsockopt: $!";
bind(Server, sockaddr_in($port, INADDR_ANY))        || die "bind: $!";
listen(Server,SOMAXCONN)                            || die "listen: $!";

logmsg "server started on port $port";

my $waitedpid = 0;
my $paddr;

use POSIX ":sys_wait_h";
use Errno;

sub REAPER {
	local $!;   # don't let waitpid() overwrite current error
	while ((my $pid = waitpid(-1,WNOHANG)) > 0 && WIFEXITED($?)) {
		logmsg "reaped $waitedpid" . ($? ? " with exit $?" : '');
	}
	$SIG{CHLD} = \&REAPER;  # loathe sysV
}

$SIG{CHLD} = \&REAPER;

my $seq_nr;
while(1) {
	$paddr = accept(Client, Server) || do {
		# try again if accept() returned because a signal was received
		next if $!{EINTR};
		die "accept: $!";
	};
	my ($port, $iaddr) = sockaddr_in($paddr);
	my $name = gethostbyaddr($iaddr, AF_INET);

	logmsg "connection from $name [", inet_ntoa($iaddr), "] at port $port";

	spawn \&pong_ping_with_guard_bytes;
	close Client;
}

sub spawn {
	my $coderef = shift;

	unless (@_ == 0 && $coderef && ref($coderef) eq 'CODE') {
		confess "usage: spawn CODEREF";
	}

	my $pid;
	if (! defined($pid = fork)) {
		logmsg "cannot fork: $!";
		return;
	}
	elsif ($pid) {
		logmsg "begat $pid";
		return; # I'm the parent
	}
	# else I'm the child -- go spawn

	$seq_nr = 0;

	open(STDIN,  "<&Client")   || die "can't dup client to stdin";
	open(STDOUT, ">&Client")   || die "can't dup client to stdout";
	## open(STDERR, ">&STDOUT") || die "can't dup stdout to stderr";
	exit &$coderef();
}

sub random_packet()
{
	# TODO: add checksum
	my $p_len = int(rand(33000));
	pack("Nnn", 0xb1b2b3b4, 0x3333, $p_len + 4) .
	("\x55" x $p_len) . pack("N", 0xe4e3e2e1);
}

sub receive_and_validate_packet {
	my ($hdr, $payload, $magic, $cmd, $len);
	$hdr = "\xbb\xaa\x99\x88" x 1024; # stupid; just to see where the corruption is from.
	if (!defined(recv STDIN, $hdr, 8, MSG_WAITALL)) {
		logmsg "recv hdr error [$seq_nr]: $!";
		return 1;
	}
	if (length($hdr) != 8) {
		logmsg "recv hdr [$seq_nr]: too short: ",
			join " ", map { sprintf "%02x", $_ } unpack "C*", $hdr;
		return 1;
	}
	($magic, $cmd, $len) = unpack "Nnn", $hdr;
	if ($magic != 0xb1b2b3b4) {
		logmsg "recv hdr [$seq_nr]: invalid magic: ",
			join " ", map { sprintf "%02x", $_ } unpack "C*", $hdr;
		return 1;
	}
	if (!defined(recv STDIN, $payload, $len, MSG_WAITALL)) {
		logmsg "recv payload error [$seq_nr]: $!";
		return 1;
	}
	my $plen = length($payload);
	my $p_last_4 = substr($payload, -4, 4);
	my $p_trail_magic = unpack "N", $p_last_4;
	if ($plen != $len) {
		logmsg "recv payload [$seq_nr]: expected $len, but received $plen byte; last bytes received: ",
			join " ", map { sprintf "%02x", $_ } unpack "C*", $p_last_4;
		return 1;
	}
	if ($p_trail_magic != 0xe4e3e2e1) {
		logmsg "recv payload [$seq_nr]: invalid trailing magic; last bytes received: ",
			join " ", map { sprintf "%02x", $_ } unpack "C*", $p_last_4;
		return 1;
	}
	++$seq_nr;
	return 0;
}

sub send_random_packet() {
	my $p = random_packet();
	my $p_len = length($p);
	# this is supposedly a blocking send,
	# so it should send the whole packet!
	my $s_len = send STDOUT, $p, 0;
	if (!defined($s_len)) {
		logmsg "send error: $!";
		return 1;
	}
	if ($s_len != $p_len) {
		logmsg "send error: wanted to send $p_len, but could only send $s_len bytes";
		return 1;
	}
}

sub pong_ping_with_guard_bytes {
	my $ret;
	while (1) {
		$ret = receive_and_validate_packet();
		return $ret if $ret;
		$ret = send_random_packet();
		return $ret if $ret;
	}
	logmsg "received $seq_nr packets\n";
	return 0;
}
_______________________________________________
general mailing list
[email protected]
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to