Hi,

On 2023-03-01 09:02:00 -0800, Andres Freund wrote:
> On 2023-03-01 11:12:35 +0200, Heikki Linnakangas wrote:
> > On 27/02/2023 23:45, Andres Freund wrote:
> > > But, uh, isn't this code racy? Because this doesn't go through shared
> > > buffers, there's no IO_IN_PROGRESS interlocking against a concurrent
> > > reader. We know that writing pages isn't atomic vs readers. So another
> > > connection could see the new relation size, but a read might return a
> > > partially written state of the page. Which then would cause checksum
> > > failures. And even worse, I think it could lead to losing a write, if the
> > > concurrent connection writes out a page.
> > 
> > fsm_readbuf and vm_readbuf check the relation size first, with
> > smgrnblocks(), before trying to read the page. So to have a problem, the
> > smgrnblocks() would have to already return the new size, but the smgrread()
> > would not return the new contents. I don't think that's possible, but not
> > sure.
> 
> I hacked Thomas' torn-read test program to ftruncate the file on the write
> side.
> 
> It frequently observes a file size that's not the write size (e.g. reading 4k
> when writing an 8k block).
> 
> After extending the test to more than one reader, I indeed also see torn
> reads. So far all the tears have been at a 4k block boundary. However, so far
> the torn part has always been *prior* page contents, not 0s.

On tmpfs the failure rate is much higher, and we also end up reading 0s,
despite never writing them.

I've attached my version of the test program.

ext4: lots of 4k reads with 8k writes, some torn reads at 4k boundaries
xfs: no issues
tmpfs: loads of 4k reads with 8k writes, lots of torn reads reading 0s, some
torn reads at 4k boundaries
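
To make the suspected window concrete, here is a simplified stand-alone sketch
of the check-size-then-read pattern that fsm_readbuf()/vm_readbuf() rely on,
with fstat()/pread() standing in for smgrnblocks()/smgrread(). The structure
and names are illustrative, not the actual code:

#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#define BLCKSZ 8192

/*
 * Check whether the block exists (the smgrnblocks() step), then read it
 * (the smgrread() step).
 */
static ssize_t
read_block_if_exists(int fd, off_t blkno, char *page)
{
	struct stat st;

	/* step 1: is the block within the current file size? */
	if (fstat(fd, &st) != 0)
		return -1;
	if (st.st_size < (blkno + 1) * BLCKSZ)
		return 0;				/* block not there (yet) */

	/*
	 * step 2: read the block.  If a concurrent writer extended the file but
	 * its 8k write is still in progress, this read can return a torn page
	 * even though step 1 already saw the new size.
	 */
	return pread(fd, page, BLCKSZ, blkno * BLCKSZ);
}

int
main(void)
{
	char page[BLCKSZ];
	int fd = open("test-file", O_RDONLY);

	if (fd < 0)
		return 1;
	printf("read returned %zd\n", read_block_if_exists(fd, 0, page));
	close(fd);
	return 0;
}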


Greetings,

Andres Freund
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#define LOOPS 100000000
#define BYTES 8192

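/*
 * Writer: repeatedly overwrite block 0 with one of two 8k patterns, then
 * truncate the file away again, so readers race against both the write and
 * the size change.
 */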
static void
writer(int fd)
{
	char buffer1[BYTES];
	char buffer2[BYTES];

	memset(buffer1, '1', sizeof(buffer1));
	memset(buffer2, '2', sizeof(buffer2));

	for (int i = 0; i < LOOPS; ++i) {
		if (pwrite(fd, i % 2 == 0 ? buffer1 : buffer2, BYTES, 0) != BYTES) {
			perror("pwrite failed");
			break;
		}
		if (ftruncate(fd, 0) != 0)
		{
			perror("ftruncate failed");
			break;
		}
	}

	close(fd);
}

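/*
 * Reader: re-read block 0 and report any read that matches neither write
 * pattern.  Short reads of 0 or 4096 bytes (file caught between the truncate
 * and a complete 8k write) are expected and skipped.
 */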
static void
reader(int readerno, int fd)
{
	char buffer[BYTES];
	char buffer1[BYTES];
	char buffer2[BYTES];

	memset(buffer1, '1', sizeof(buffer1));
	memset(buffer2, '2', sizeof(buffer2));

	for (int i = 0; i < LOOPS; ++i) {
		ssize_t r;

		r = pread(fd, buffer, BYTES, 0);
		if (r == 0 || r == 4096)
			continue;
		else if (r != BYTES) {
			fprintf(stderr, "[%d] pread returned %zd: %m\n",
					readerno, r);
			continue;
		}
		if (memcmp(buffer, buffer1, BYTES) != 0 &&
			memcmp(buffer, buffer2, BYTES) != 0) {
			char b0 = buffer[0];
			if (b0 == 0)
			{
				printf("found 0\n");
			}
			for (int j = 1; j < BYTES; ++j)
			{
				if (b0 != buffer[j])
				{
					fprintf(stderr, "[%d] difference at byte %d, byte before: %02x, byte: %02x\n",
							readerno, j, buffer[j - 1], buffer[j]);
					break;
				}
			}
		}
	}
	close(fd);

	return;
}

int main(int argc, char *argv[])
{
#define NCHILDREN 80
	int childpids[NCHILDREN];
	int fd;

	fd = open("test-file", O_RDWR | O_CREAT, 0644);
	if (fd < 0) {
		perror("open");
		exit(EXIT_FAILURE);
	}

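	/* fork the readers first; the parent then acts as the single writer */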
	for (int i = 0; i < NCHILDREN; i++)
	{
		int pid = fork();

		if (pid < 0) {
			fprintf(stderr, "fork failed\n");
			return EXIT_FAILURE;
		}
		else if (pid == 0)
		{
			reader(i, fd);
			fprintf(stderr, "[%d] reader done\n", i);
			exit(0);
		}
		else
		{
			childpids[i] = pid;
		}
	}

	int status;

	writer(fd);
	fprintf(stderr, "writer done, killing readers\n");

	for (int i = 0; i < NCHILDREN; i++)
		kill(childpids[i], SIGTERM);

	for (int i = 0; i < NCHILDREN; i++)
		waitpid(childpids[i], &status, 0);

	return status;
}
