Re: [HACKERS] strncpy is not a safe version of strcpy

2014-08-14 Thread Tom Lane
Noah Misch n...@leadboat.com writes:
 On Wed, Aug 13, 2014 at 10:21:50AM -0400, Tom Lane wrote:
 I believe that we deal with this by the expedient of checking the lengths
 of tablespace paths in advance, when the tablespace is created.

 The files under scrutiny here are not located in a tablespace.  Even if they
 were, isn't the length of $PGDATA/pg_tblspc the important factor?

The length of $PGDATA is of no relevance whatsoever; we chdir into that
directory at startup, and subsequently all paths are implicitly relative
to there.  If there is any backend code that's prepending $PGDATA to
something else, it's wrong to start with.
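For anyone arriving from the subject line, here is a minimal standalone illustration of the strncpy() hazard the thread title refers to (this is not code from the tree, just a demonstration): when the source fills the buffer, strncpy() writes no terminating NUL, so the caller must add one.

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char		buf[8];

	/* strncpy() copies at most sizeof(buf) bytes and adds no NUL here */
	strncpy(buf, "pg_tblspc/16384", sizeof(buf));
	buf[sizeof(buf) - 1] = '\0';	/* caller must terminate explicitly */
	printf("%s\n", buf);			/* prints "pg_tbls" */
	return 0;
}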

regards, tom lane




Re: [HACKERS] Improvement of versioning on Windows, take two

2014-08-14 Thread Michael Paquier
On Tue, Aug 12, 2014 at 11:47 PM, MauMau maumau...@gmail.com wrote:
 From: Michael Paquier michael.paqu...@gmail.com
 Yes, the build succeeded.  I confirmed that the following files have version
 info.  However, unlike other files, they don't have file description.  Is
 this intended?
 bin\isolationtester.exe
 bin\pg_isolation_regress
 bin\pg_regress.exe
 bin\pg_regress_ecpg.exe
 bin\zic.exe

No... But after some additional hacking on this, I have been able to
complete that. It required, for example, adding a new function called
AddUtilityResourceFile in Project.pm that scans a Makefile within a given
folder and extracts the PGFILEDESC and PGAPPICON values, which are then
used to create a win32ver.rc in the targeted build folder. Note that on
MinGW all those exe/dll files had a file description and version number
even with the previous version.

 lib\regress.dll
With MinGW, this had no file description and no version number. Of
course that was the same with MSVC. Fixed.

 lib\dict_snowball.dll has no version properties.
On MSVC there was no file description and no version number. On
MinGW, I saw a version number. Thanks for spotting that; I've fixed
it.

Regards,
-- 
Michael
diff --git a/src/backend/snowball/Makefile b/src/backend/snowball/Makefile
index ac80efe..577c512 100644
--- a/src/backend/snowball/Makefile
+++ b/src/backend/snowball/Makefile
@@ -10,10 +10,13 @@ subdir = src/backend/snowball
 top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 
+PGFILEDESC = snowball - set of dictionary templates
+PGAPPICON = win32
+
 override CPPFLAGS := -I$(top_srcdir)/src/include/snowball \
 	-I$(top_srcdir)/src/include/snowball/libstemmer $(CPPFLAGS)
 
-OBJS= dict_snowball.o api.o utilities.o \
+OBJS= $(WIN32RES) dict_snowball.o api.o utilities.o \
 	stem_ISO_8859_1_danish.o \
 	stem_ISO_8859_1_dutch.o \
 	stem_ISO_8859_1_english.o \
diff --git a/src/interfaces/ecpg/test/Makefile b/src/interfaces/ecpg/test/Makefile
index 56f6a17..a2e0a83 100644
--- a/src/interfaces/ecpg/test/Makefile
+++ b/src/interfaces/ecpg/test/Makefile
@@ -14,6 +14,7 @@ override CPPFLAGS := \
 	$(CPPFLAGS)
 
 PGFILEDESC = ECPG Test - regression tests for ECPG
+PGAPPICON = win32
 
 # where to find psql for testing an existing installation
 PSQLDIR = $(bindir)
diff --git a/src/test/isolation/Makefile b/src/test/isolation/Makefile
index 26bcf22..77bc43d 100644
--- a/src/test/isolation/Makefile
+++ b/src/test/isolation/Makefile
@@ -6,12 +6,15 @@ subdir = src/test/isolation
 top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 
+PGFILEDESC = pg_isolation_tester/isolationtester - isolation regression tests
+PGAPPICON = win32
+
 # where to find psql for testing an existing installation
 PSQLDIR = $(bindir)
 
 override CPPFLAGS := -I$(srcdir) -I$(libpq_srcdir) -I$(srcdir)/../regress $(CPPFLAGS)
 
-OBJS =  specparse.o isolationtester.o
+OBJS =  specparse.o isolationtester.o $(WIN32RES)
 
 all: isolationtester$(X) pg_isolation_regress$(X)
 
@@ -21,7 +24,7 @@ submake-regress:
 pg_regress.o: | submake-regress
 	rm -f $@ && $(LN_S) $(top_builddir)/src/test/regress/pg_regress.o .
 
-pg_isolation_regress$(X): isolation_main.o pg_regress.o
+pg_isolation_regress$(X): isolation_main.o pg_regress.o $(WIN32RES)
 	$(CC) $(CFLAGS) $^ $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@
 
 isolationtester$(X): $(OBJS) | submake-libpq submake-libpgport
diff --git a/src/test/regress/GNUmakefile b/src/test/regress/GNUmakefile
index b084e0a..49c46c7 100644
--- a/src/test/regress/GNUmakefile
+++ b/src/test/regress/GNUmakefile
@@ -14,6 +14,9 @@ subdir = src/test/regress
 top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 
+PGFILEDESC = pg_regress - regression tests
+PGAPPICON = win32
+
 # file with extra config for temp build
 TEMP_CONF =
 ifdef TEMP_CONFIG
@@ -43,7 +46,7 @@ EXTRADEFS = '-DHOST_TUPLE=$(host_tuple)' \
 
 all: pg_regress$(X)
 
-pg_regress$(X): pg_regress.o pg_regress_main.o | submake-libpgport
+pg_regress$(X): pg_regress.o pg_regress_main.o $(WIN32RES) | submake-libpgport
 	$(CC) $(CFLAGS) $^ $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@
 
 # dependencies ensure that path changes propagate
@@ -66,7 +69,7 @@ uninstall:
 # Build dynamically-loaded object file for CREATE FUNCTION ... LANGUAGE C.
 
 NAME = regress
-OBJS = regress.o
+OBJS = $(WIN32RES) regress.o
 
 include $(top_srcdir)/src/Makefile.shlib
 
diff --git a/src/timezone/Makefile b/src/timezone/Makefile
index ef739e9..ab65d22 100644
--- a/src/timezone/Makefile
+++ b/src/timezone/Makefile
@@ -12,11 +12,14 @@ subdir = src/timezone
 top_builddir = ../..
 include $(top_builddir)/src/Makefile.global
 
+PGFILEDESC = timezone - timezone database
+PGAPPICON = win32
+
 # files to build into backend
 OBJS= localtime.o strftime.o pgtz.o
 
 # files needed to build zic utility program
-ZICOBJS= zic.o ialloc.o scheck.o localtime.o
+ZICOBJS= zic.o ialloc.o scheck.o localtime.o $(WIN32RES)
 
 # timezone data files
 TZDATA = africa antarctica asia australasia europe 

Re: [HACKERS] pg_dump bug in 9.4beta2 and HEAD

2014-08-14 Thread Heikki Linnakangas

On 08/14/2014 06:53 AM, Joachim Wieland wrote:

I'm seeing an assertion failure with pg_dump -c --if-exists, which is
apparently not ready to handle BLOBs:

pg_dump: pg_backup_archiver.c:472: RestoreArchive: Assertion `mark !=
((void *)0)' failed.

To reproduce:

$ createdb test
$ pg_dump -c --if-exists test  (works, dumps empty database)
$ psql test -c "select lo_create(1);"
$ pg_dump -c --if-exists test  (fails, with the above mentioned assertion)


The code tries to inject an IF EXISTS into the already-constructed DROP 
command, but it doesn't work for large objects, because the deletion 
command looks like "SELECT pg_catalog.lo_unlink(xxx)". There is no DROP 
there.


I believe we could use "SELECT pg_catalog.lo_unlink(loid) FROM 
pg_catalog.pg_largeobject_metadata WHERE loid = xxx". The 
pg_largeobject_metadata table didn't exist before version 9.0, but we 
don't guarantee pg_dump's output to be compatible in that direction 
anyway, so I think that's fine.


The quick fix would be to add an exception for blobs, close to where the 
Assert is. There are a few exceptions there already. A cleaner solution 
would be to add a new argument to ArchiveEntry and make the callers 
responsible for providing a DROP IF EXISTS query, but that's not too 
appetizing because for most callers it would be boring boilerplate code. 
Perhaps add an argument, but if it's NULL, ArchiveEntry would form the 
if-exists query automatically from the DROP query.
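For concreteness only, a hedged sketch of the quick-fix idea; the surrounding
variable names (dropStmt, AH) are assumptions about pg_dump's context, not the
actual patch:

	/* Sketch only: skip the IF EXISTS rewrite when the "drop" command is not
	 * a DROP statement at all, which is the case for large objects. */
	if (strncmp(dropStmt, "SELECT pg_catalog.lo_unlink", 27) == 0)
		ahprintf(AH, "%s", dropStmt);	/* emit it unchanged */
	else
	{
		/* existing logic: splice IF EXISTS into the DROP ... statement */
	}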


- Heikki





Re: [HACKERS] 9.5: Memory-bounded HashAgg

2014-08-14 Thread Jeff Davis
I think the hash-join like approach is reasonable, but I also think
you're going to run into a lot of challenges that make it more complex
for HashAgg. For instance, let's say you have the query:

  SELECT x, array_agg(y) FROM foo GROUP BY x;

Say the transition state is an array (for the sake of simplicity), so
the hash table has something like:

  1000 = {7,   8,  9}
  1001 = {12, 13, 14}

You run out of memory and need to split the hash table, so you scan the
hash table and find that group 1001 needs to be written to disk. So you
serialize the key and array and write them out.

Then the next tuple you get is (1001, 19). What do you do? Create a new
group 1001 = {19} (how do you combine it later with the first one)? Or
try to fetch the existing group 1001 from disk and advance it (horrible
random I/O)?



On Wed, 2014-08-13 at 12:31 +0200, Tomas Vondra wrote:
 My understanding of the batching algorithm (and I may be wrong on this
 one) is that once you choose the number of batches, it's pretty much
 fixed. Is that the case?

It's only fixed for that one work item (iteration). A different K can
be selected if memory is exhausted again. But you're right: this is a
little less flexible than HashJoin.

 But what will happen in case of significant cardinality underestimate?
 I.e. what will happen if you decide to use 16 batches, and then find
 out 256 would be more appropriate? I believe you'll end up with batches
 16x the size you'd want, most likely exceeding work_mem.

Yes, except that work_mem would never be exceeded. If the partitions are
16X work_mem, then each would be added as another work_item, and
hopefully it would choose better the next time.

  One thing I like about my simple approach is that it returns a good
  number of groups after each pass, and then those are completely finished
  (returned to the operator above, even). That's impossible with HashJoin
  because the hashing all needs to be done before the probe phase begins.
 
 The hash-join approach returns ~1/N groups after each pass, so I fail to
 see how this is better?

You can't return any tuples until you begin the probe phase, and that
doesn't happen until you've hashed the entire inner side (which involves
splitting and other work). With my patch, it will return some tuples
after the first scan. Perhaps I'm splitting hairs here, but the idea of
finalizing some groups as early as possible seems appealing.

 Aha! And the new batches are 'private' to the work item, making it a bit
 recursive, right? Is there any reason not to just double the number of
 batches globally?

I didn't quite follow this proposal.

 It also seems to me that using HASH_DISK_MAX_PARTITIONS, and then allowing
 each work item to create its own set of additional partitions, effectively
 renders HASH_DISK_MAX_PARTITIONS futile.

It's the number of active partitions that matters, because that's what
causes the random I/O.

Regards,
Jeff Davis







Re: [HACKERS] WAL format and API changes (9.5)

2014-08-14 Thread Michael Paquier
On Thu, Aug 14, 2014 at 3:04 AM, Alvaro Herrera
alvhe...@2ndquadrant.com wrote:
 Heikki Linnakangas wrote:
 What's with XLogReplayLSN and XLogReplayRecord?  IMO the xlog code has
 more than enough global variables already, it'd be good to avoid two
 more if possible.  Is there no other good way to get this info down to
 whatever it is that needs them?
Yep, they do not look necessary. Looking at the patch, you could get
rid of XLogReplayLSN and XLogReplayRecord by adding two extra arguments
to XLogReplayBuffer: one for the LSN of the current record, and a
second for the record pointer. The code just saves those two variables
in the redo loop of StartupXLOG only to reuse them in
XLogReplayBufferExtended, and I saw no code path in the redo routines
where XLogReplayBuffer is called at a place where the LSN position
and the record pointer are not available.
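To make the suggestion concrete, a hypothetical sketch of the two approaches;
the names and argument order in Heikki's actual patch may differ:

	/* current draft: the values travel out-of-band via globals set in StartupXLOG */
	extern XLogRecPtr XLogReplayLSN;
	extern XLogRecord *XLogReplayRecord;

	/* suggested alternative: each redo routine passes them down explicitly */
	extern int XLogReplayBuffer(XLogRecPtr lsn, XLogRecord *record,
								int block_index, Buffer *buf);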

However I think that Heikki introduced those two variables to make the
move to the next patch easier.
Regards,
-- 
Michael




[HACKERS] Immediate standby promotion

2014-08-14 Thread Fujii Masao
Hi,

I'd like to propose adding a new option, --immediate, to pg_ctl promote.
When this option is set, recovery ignores any WAL data that has not
been replayed yet and exits immediately. Patch attached.

This promotion is faster than the normal one but can cause data loss. So
it's useful if we want to switch the server to normal operation as
soon as possible at the expense of durability. Also imagine the case
where, while recovery is being delayed (by a time-delayed standby,
which was introduced in 9.4) or paused (by pg_xlog_replay_pause),
you find that subsequent WAL data would cause a disaster
(for example, WAL data indicating an unexpected deletion of an
important database). In this case, immediate promotion can be
used to ignore such problematic WAL data.

With the patch, the --immediate option controls whether immediate
promotion is performed or not. OTOH, we could extend the -m option
that pg_ctl stop already uses so that it also controls the mode of
promotion. But when we discussed this feature before, someone
disagreed with doing that because shutdown mode and promotion mode
are different concepts. For example, if smart is specified as the mode,
how should the promotion work? I agree with him, and just added a
new separate option instead. Thoughts?

Regards,

-- 
Fujii Masao
*** a/doc/src/sgml/high-availability.sgml
--- b/doc/src/sgml/high-availability.sgml
***
*** 620,626  protocol to make nodes agree on a serializable transactional order.
  when <command>pg_ctl promote</> is run or a trigger file is found
  (<varname>trigger_file</>). Before failover,
  any WAL immediately available in the archive or in <filename>pg_xlog</> will be
! restored, but no attempt is made to connect to the master.
     </para>
    </sect2>
  
--- 620,636 
  when <command>pg_ctl promote</> is run or a trigger file is found
  (<varname>trigger_file</>). Before failover,
  any WAL immediately available in the archive or in <filename>pg_xlog</> will be
! restored by default, but no attempt is made to connect to the master.
! If the <literal>--immediate</> option is specified, <command>pg_ctl promote</>
! makes recovery ignore any WAL that has not been replayed yet
! and exit immediately. This promotion is faster but can cause data loss.
! So it's useful if you want to switch the server to normal operation
! as soon as possible at the expense of durability.
! While recovery is being delayed (by a time-delayed standby) or paused,
! you may find that subsequent WAL data can cause a disaster to happen
! (for example, WAL data indicating an unexpected deletion of an important
! database). In this case, the <literal>--immediate</> option can be used to
! ignore such problematic WAL data.
     </para>
    </sect2>
  
*** a/doc/src/sgml/ref/pg_ctl-ref.sgml
--- b/doc/src/sgml/ref/pg_ctl-ref.sgml
***
*** 93,98  PostgreSQL documentation
--- 93,99 
   <arg choice="plain"><option>promote</option></arg>
   <arg choice="opt"><option>-s</option></arg>
   <arg choice="opt"><option>-D</option> <replaceable>datadir</replaceable></arg>
+  <arg choice="opt"><option>--immediate</option></arg>
  </cmdsynopsis>
  
  <cmdsynopsis>
***
*** 404,409  PostgreSQL documentation
--- 405,419 
   </varlistentry>
 
   <varlistentry>
+   <term><option>--immediate</option></term>
+   <listitem>
+    <para>
+     Exit recovery immediately in promote mode.
+    </para>
+   </listitem>
+  </varlistentry>
+ 
+  <varlistentry>
    <term><option>-?</></term>
    <term><option>--help</></term>
    <listitem>
*** a/src/backend/access/transam/xlog.c
--- b/src/backend/access/transam/xlog.c
***
*** 74,79  extern uint32 bootstrap_data_checksum_version;
--- 74,80 
  #define RECOVERY_COMMAND_DONE	"recovery.done"
  #define PROMOTE_SIGNAL_FILE		"promote"
  #define FALLBACK_PROMOTE_SIGNAL_FILE "fallback_promote"
+ #define IMMEDIATE_PROMOTE_SIGNAL_FILE	"immediate_promote"
  
  
  /* User-settable parameters */
***
*** 240,245  bool		StandbyMode = false;
--- 241,249 
  /* whether request for fast promotion has been made yet */
  static bool fast_promote = false;
  
+ /* whether request for immediate promotion has been made yet */
+ static bool immediate_promote = false;
+ 
  /*
   * if recoveryStopsBefore/After returns true, it saves information of the stop
   * point here
***
*** 6761,6766  StartupXLOG(void)
--- 6765,6777 
  		recoveryPausesHere();
  }
  
+ /*
+  * In immediate promotion, we ignore WAL data that have not
+  * been replayed yet and exit recovery immediately.
+  */
+ if (immediate_promote)
+ 	break;
+ 
  /* Setup error traceback support for ereport() */
  errcallback.callback = rm_redo_error_callback;
  errcallback.arg = (void *) record;
***
*** 11299,11315  CheckForStandbyTrigger(void)
--- 11310,11341 
  		 * Startup process do the unlink. This allows Startup to know whether
  		 * it should create a full checkpoint 

Re: [HACKERS] WAL format and API changes (9.5)

2014-08-14 Thread Michael Paquier
On Thu, Aug 14, 2014 at 2:05 AM, Heikki Linnakangas
hlinnakan...@vmware.com wrote:
 Here's a full version of this refactoring patch, all the rmgr's have now
 been updated to use XLogReplayBuffer(). I think this is a worthwhile change
 on its own, even if we drop the ball on the rest of the WAL format patch,
 because it makes the redo-routines more readable. I propose to commit this
 as soon as someone has reviewed it, and we agree on a for what's currently
 called XLogReplayBuffer(). I have tested this with my page-image comparison
 tool.

I have also run this patch through the page comparison tool and
found no problems, with both the regression and isolation tests. I also
had a look at the redo routines that are changed and found nothing to
complain about. What this patch does is not very complicated: it adds a
couple of status flags returned by XLogReplayBuffer. Roughly, when
BLK_NEEDS_REDO is returned, the actions previously done after restoring the
block are performed. This has the merit of moving the LSN check on the current
page, which determines whether the page needs to be replayed, inside
XLogReplayBuffer.
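For illustration, the calling pattern described above would look roughly like
this in a redo routine; BLK_NEEDS_REDO comes from the patch, but the exact
argument list of XLogReplayBuffer is an assumption here, not the patch's code:

	if (XLogReplayBuffer(record, 0, &buffer) == BLK_NEEDS_REDO)
	{
		Page		page = BufferGetPage(buffer);

		/* the LSN-vs-page check already happened inside XLogReplayBuffer() */
		/* ... apply the logged change to the page ... */
		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
	}
	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);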

A couple of minor comments though:
1) Why changing that from ERROR to PANIC?
/* Caller specified a bogus block_index */
-   elog(ERROR, failed to restore block_index %d, block_index);
+   elog(PANIC, failed to restore block_index %d, block_index);
2) There are some whitespaces, like here:
+   {
+   destPage = NULL;/* don't do any page updates */
}

Regards,
-- 
Michael




Re: [HACKERS] delta relations in AFTER triggers

2014-08-14 Thread Amit Khandekar
On 12 August 2014 20:09, Kevin Grittner kgri...@ymail.com wrote:
 Amit Khandekar amit.khande...@enterprisedb.com wrote:
 On 7 August 2014 19:49, Kevin Grittner kgri...@ymail.com wrote:
 Amit Khandekar amit.khande...@enterprisedb.com wrote:

 I tried to google some SQL that uses the REFERENCING clause with triggers.
 It looks like in some database systems, even the WHEN clause of CREATE
 TRIGGER can refer to a transition table, just as it can refer to the NEW and
 OLD row variables.

 For e.g. :
 CREATE TRIGGER notify_dept
   AFTER UPDATE ON weather
   REFERENCING NEW_TABLE AS N_TABLE
   NEW AS N_ROW
   FOR EACH ROW
   WHEN ((SELECT AVG (temperature) FROM N_TABLE) > 10)
   BEGIN
 notify_department(N_ROW.temperature, N_ROW.city);
   END

 Above, it is used to get an aggregate value over all the changed rows. I
 think we do not currently support aggregate expressions in the WHEN clause,
 but with transition tables, it makes more sense to support that later if
 not now.

 Interesting point; I had not thought about that.  Will see if I can
 include support for that in the patch for the next CF; failing
 that; I will at least be careful to not paint myself into a corner
 where it is unduly hard to do later.
 We currently do the WHEN checks while saving the AFTER trigger events,
 and also add the tuples one by one while saving the trigger events. If
 and when we support WHEN, we would need to make all of these tuples
 saved *before* the first WHEN clause execution, and that seems to
 demand more changes in the current trigger code.

 In that case my inclination is to get things working with the less
 invasive patch that doesn't try to support transition table usage
 in WHEN clauses, and make support for that a later patch.
Agreed.

 ---

 Are we later going to extend this support for constraint triggers as
 well ? I think these variables would make sense even for deferred
 constraint triggers. I think we would need some more changes if we
 want to support this, because there is no query depth while executing
 deferred triggers and so the tuplestores might be inaccessible with
 the current design.

 Hmm, I would also prefer to exclude that from an initial patch, but
 this and the WHEN clause issue may influence a decision I've been
 struggling with.  This is my first non-trivial foray into the
 planner territory, and I've been struggling with how best to bridge
 the gap between where the tuplestores are *captured* in the trigger
 code and where they are referenced by name in a query and
 incorporated into a plan for the executor.  (The execution level
 itself was almost trivial; it's getting the tuplestore reference
 through the parse analysis and planning phases that is painful for
 me.
I am not sure why you think we would need to refer to the tuplestores in
the parse analysis and planner phases. It seems that we would need
them only in the execution phase. Or maybe I didn't understand your
point.

 )  At one point I create a tuplestore registry using a
 process-local hashmap where each Tuplestorestate and its associated
 name, TupleDesc, etc. would be registered, yielding a Tsrid (based
 on Oid) to use through the parse analysis and planning steps, but
 then I ripped it back out again in favor of just passing the
 pointer to the structure which was stored in the registry; because
 the registry seemed to me to introduce more risk of memory leaks,
 references to freed memory, etc.  While it helped a little with
 abstraction, it seemed to make things more fragile.  But I'm still
 torn on this, and unsure whether such a registry is a good idea.
I feel it is OK to use direct tuplestore pointers as handles, as you
have done in the patch. I may be biased toward that, as against the
above method of accessing a tuplestore by its name through a hash
lookup; the reason for my bias might be one particular way I can see
deferred constraint triggers being supported. In the after-trigger
event structure, we can store these delta tuplestore pointers as
well. This way, we don't need to worry about how to look up these
tuplestores, and also need not worry about the mechanism that moves
these events from the deferred event list to the immediate event list
in case of SET CONSTRAINTS. The only thing we would need to make sure
of is to clean up these tuplestores exactly where the event structures
get cleaned up.
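A purely illustrative sketch of that idea; the struct name and layout below
are hypothetical, not the actual AfterTriggerEventData:

	/* Hypothetical: carry the transition tuplestores with the queued event so
	 * that deferred constraint triggers can still reach them at firing time. */
	typedef struct AfterTriggerTransitionData
	{
		Tuplestorestate *old_table;		/* rows as they were before the statement */
		Tuplestorestate *new_table;		/* rows as they are after the statement */
	} AfterTriggerTransitionData;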

This is all speculation on my part. But what I think is that we won't have to
heavily refactor your patch in order to extend support for
deferred constraint triggers. And for the WHEN clause, we have to
make some changes in the existing trigger code anyway.


 ---

 The following (and other similar) statements:
 trigdesc->trig_insert_new_table |=
 (TRIGGER_FOR_INSERT(tgtype) &&
 TRIGGER_USES_TRANSITION_TABLE(trigger->tgnewtable)) ? true : false;

 can be simplified to:

 trigdesc->trig_insert_new_table |=
 (TRIGGER_FOR_INSERT(tgtype) &&
 TRIGGER_USES_TRANSITION_TABLE(trigger->tgnewtable));

 Yeah, I did recognize 

Re: [HACKERS] jsonb format is pessimal for toast compression

2014-08-14 Thread Heikki Linnakangas

On 08/14/2014 04:01 AM, Tom Lane wrote:

I wrote:

That's a fair question.  I did a very very simple hack to replace the item
offsets with item lengths -- turns out that that mostly requires removing
some code that changes lengths to offsets ;-).  I then loaded up Larry's
example of a noncompressible JSON value, and compared pg_column_size()
which is just about the right thing here since it reports datum size after
compression.  Remembering that the textual representation is 12353 bytes:



json:   382 bytes
jsonb, using offsets:   12593 bytes
jsonb, using lengths:   406 bytes


Oh, one more result: if I leave the representation alone, but change
the compression parameters to set first_success_by to INT_MAX, this
value takes up 1397 bytes.  So that's better, but still more than a
3X penalty compared to using lengths.  (Admittedly, this test value
probably is an outlier compared to normal practice, since it's a hundred
or so repetitions of the same two strings.)


For comparison, here's a patch that implements the scheme that Alexander 
Korotkov suggested, where we store an offset every 8th element, and a 
length in the others. It compresses Larry's example to 525 bytes. 
Increasing the stride from 8 to 16 entries, it compresses to 461 bytes.


A nice thing about this patch is that it's on-disk compatible with the 
current format, hence initdb is not required.
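To make the compression effect concrete, a tiny standalone illustration (not
part of the patch): for a document made of repetitions of the same short
strings, the end offsets form a strictly increasing sequence with no repeats,
while the lengths repeat the same value over and over, which is exactly the
kind of input pglz compresses well.

#include <stdio.h>

int
main(void)
{
	int			i;

	printf("end offsets:");
	for (i = 1; i <= 10; i++)
		printf(" %d", i * 5);	/* 5 10 15 ... 50: every value distinct */
	printf("\nlengths:    ");
	for (i = 1; i <= 10; i++)
		printf(" %d", 5);		/* 5 5 5 ... 5: trivially compressible */
	printf("\n");
	return 0;
}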


(The current comments claim that the first element in an array always 
has the JENTRY_ISFIRST flag set; that is wrong, there is no such flag. 
I removed the flag in commit d9daff0e, but apparently failed to update 
the comment and the accompanying JBE_ISFIRST macro. Sorry about that, 
will fix. This patch uses the bit that used to be JENTRY_ISFIRST to mark 
entries that store a length instead of an end offset.).


- Heikki

diff --git a/src/backend/utils/adt/jsonb_util.c b/src/backend/utils/adt/jsonb_util.c
index 04f35bf..47b2998 100644
--- a/src/backend/utils/adt/jsonb_util.c
+++ b/src/backend/utils/adt/jsonb_util.c
@@ -1378,8 +1378,10 @@ convertJsonbArray(StringInfo buffer, JEntry *pheader, JsonbValue *val, int level
 	 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
 			JENTRY_POSMASK)));
 
-		if (i > 0)
+		if (i % JBE_STORE_LEN_STRIDE == 0)
 			meta = (meta & ~JENTRY_POSMASK) | totallen;
+		else
+			meta |= JENTRY_HAS_LEN;
 		copyToBuffer(buffer, metaoffset, (char *) meta, sizeof(JEntry));
 		metaoffset += sizeof(JEntry);
 	}
@@ -1430,11 +1432,14 @@ convertJsonbObject(StringInfo buffer, JEntry *pheader, JsonbValue *val, int leve
 	 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
 			JENTRY_POSMASK)));
 
-		if (i > 0)
+		if (i % JBE_STORE_LEN_STRIDE == 0)
 			meta = (meta & ~JENTRY_POSMASK) | totallen;
+		else
+			meta |= JENTRY_HAS_LEN;
 		copyToBuffer(buffer, metaoffset, (char *) meta, sizeof(JEntry));
 		metaoffset += sizeof(JEntry);
 
+		/* put value */
 		convertJsonbValue(buffer, meta, pair-value, level);
 		len = meta & JENTRY_POSMASK;
 		totallen += len;
@@ -1445,7 +1450,7 @@ convertJsonbObject(StringInfo buffer, JEntry *pheader, JsonbValue *val, int leve
 	 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
 			JENTRY_POSMASK)));
 
-		meta = (meta & ~JENTRY_POSMASK) | totallen;
+		meta |= JENTRY_HAS_LEN;
 		copyToBuffer(buffer, metaoffset, (char *) meta, sizeof(JEntry));
 		metaoffset += sizeof(JEntry);
 	}
@@ -1592,3 +1597,39 @@ uniqueifyJsonbObject(JsonbValue *object)
 		object->val.object.nPairs = res + 1 - object->val.object.pairs;
 	}
 }
+
+uint32
+jsonb_get_offset(const JEntry *ja, int index)
+{
+	uint32		off = 0;
+	int			i;
+
+	/*
+	 * Each absolute entry contains the *end* offset. Start offset of this
+	 * entry is equal to the end offset of the previous entry.
+	 */
+	for (i = index - 1; i >= 0; i--)
+	{
+		off += JBE_POSFLD(ja[i]);
+		if (!JBE_HAS_LEN(ja[i]))
+			break;
+	}
+	return off;
+}
+
+uint32
+jsonb_get_length(const JEntry *ja, int index)
+{
+	uint32		off;
+	uint32		len;
+
+	if (JBE_HAS_LEN(ja[index]))
+		len = JBE_POSFLD(ja[index]);
+	else
+	{
+		off = jsonb_get_offset(ja, index);
+		len = JBE_POSFLD(ja[index]) - off;
+	}
+
+	return len;
+}
diff --git a/src/include/utils/jsonb.h b/src/include/utils/jsonb.h
index 5f2594b..dae6512 100644
--- a/src/include/utils/jsonb.h
+++ b/src/include/utils/jsonb.h
@@ -102,12 +102,12 @@ typedef struct JsonbValue JsonbValue;
  * to JB_FSCALAR | JB_FARRAY.
  *
  * To encode the length and offset of the variable-length portion of each
- * node in a compact way, the JEntry stores only the end offset within the
- * variable-length portion of the container node. For the first JEntry in the
- * container's JEntry array, that equals to the length of the node data. For
- * convenience, the JENTRY_ISFIRST flag is set. The begin offset and length
- * of the rest of the entries can be calculated using the end offset of the
- * previous JEntry in the array.
+ * 

Re: [HACKERS] WAL format and API changes (9.5)

2014-08-14 Thread Heikki Linnakangas

On 08/14/2014 10:29 AM, Michael Paquier wrote:

On Thu, Aug 14, 2014 at 3:04 AM, Alvaro Herrera
alvhe...@2ndquadrant.com wrote:

Heikki Linnakangas wrote:
What's with XLogReplayLSN and XLogReplayRecord?  IMO the xlog code has
more than enough global variables already, it'd be good to avoid two
more if possible.  Is there no other good way to get this info down to
whatever it is that needs them?

Yep, they do not look necessary. Looking at the patch, you could get
rid of XLogReplayLSN and XLogReplayRecord by adding two extra argument
to XLogReplayBuffer: one for the LSN of the current record, and a
second for the record pointer. The code just saves those two variables
in the redo loop of StartupXLOG to only reuse them in
XLogReplayBufferExtended, and I saw no code paths in the redo routines
where XLogReplayBuffer is called at places without the LSN position
and the record pointer.

However I think that Heikki introduced those two variables to make the
move to the next patch easier.


The next patch doesn't necessarily require them either; you could always 
pass the LSN and record as arguments. I wanted to avoid that, because 
every redo function would just pass the current record being replayed, 
so it seems nicer to pass that information out-of-band. I guess if we 
do that, though, we should remove those arguments from the rm_redo interface 
altogether, and always rely on the global variables to get the current 
record or its LSN. I'm not wedded to this, I could be persuaded to go 
either way...


- Heikki




Re: [HACKERS] WAL format and API changes (9.5)

2014-08-14 Thread Heikki Linnakangas

On 08/14/2014 11:22 AM, Michael Paquier wrote:

1) Why change that from ERROR to PANIC?
 /* Caller specified a bogus block_index */
-   elog(ERROR, "failed to restore block_index %d", block_index);
+   elog(PANIC, "failed to restore block_index %d", block_index);


No reason, just a leftover from debugging. Please ignore.

- Heikki




Re: [HACKERS] WAL format and API changes (9.5)

2014-08-14 Thread Andres Freund
On 2014-08-14 12:41:35 +0300, Heikki Linnakangas wrote:
 On 08/14/2014 10:29 AM, Michael Paquier wrote:
 On Thu, Aug 14, 2014 at 3:04 AM, Alvaro Herrera
 alvhe...@2ndquadrant.com wrote:
 Heikki Linnakangas wrote:
 What's with XLogReplayLSN and XLogReplayRecord?  IMO the xlog code has
 more than enough global variables already, it'd be good to avoid two
 more if possible.  Is there no other good way to get this info down to
 whatever it is that needs them?
 Yep, they do not look necessary. Looking at the patch, you could get
 rid of XLogReplayLSN and XLogReplayRecord by adding two extra argument
 to XLogReplayBuffer: one for the LSN of the current record, and a
 second for the record pointer. The code just saves those two variables
 in the redo loop of StartupXLOG to only reuse them in
 XLogReplayBufferExtended, and I saw no code paths in the redo routines
 where XLogReplayBuffer is called at places without the LSN position
 and the record pointer.
 
 However I think that Heikki introduced those two variables to make the
 move to the next patch easier.
 
 The next patch doesn't necessary require them either, you could always pass
 the LSN and record as an argument. I wanted to avoid that, because every
 redo function would just pass the current record being replayed, so it seems
 nicer to pass that information out-of-band. I guess if we do that, though,
 we should remove those arguments from rm_redo interface altogether, and
 always rely on the global variables to get the current record or its LSN.
 I'm not wedded on this, I could be persuaded to go either way...

I personally find the out of band transport really ugly.

Greetings,

Andres Freund

-- 
 Andres Freund http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services




Re: [HACKERS] delta relations in AFTER triggers

2014-08-14 Thread Amit Khandekar
 The execution level
 itself was almost trivial; it's getting the tuplestore reference
 through the parse analysis and planning phases that is painful for
 me.
 I am not sure why you think we would need to refer the tuplestore in
 the parse analysis and planner phases. It seems that we would need
 them only in execution phase. Or may be I didn't understand your
 point.
Ah, I think I understand now. That might be because you are thinking of
having an infrastructure common to triggers and materialized views,
right?




Re: [HACKERS] What happened to jsonb's JENTRY_ISFIRST?

2014-08-14 Thread Heikki Linnakangas

On 08/14/2014 02:45 AM, Peter Geoghegan wrote:

On Wed, Aug 13, 2014 at 3:47 PM, Tom Lane t...@sss.pgh.pa.us wrote:

If the bit is unused now, should we be worrying about reclaiming it for
better use?  Like say allowing jsonb's to be larger than just a quarter
of the maximum datum size?


Commit d9daff0e0cb15221789e6c50d9733c8754c054fb removed it. This is an
obsolete comment.


Yeah. I just noticed the same and replied in the other thread 
(http://www.postgresql.org/message-id/53ec8194.4020...@vmware.com). Note 
to self: check all the mails in inbox before replying..


- Heikki





Re: [HACKERS] Support for N synchronous standby servers

2014-08-14 Thread Fujii Masao
On Wed, Aug 13, 2014 at 4:10 PM, Michael Paquier
michael.paqu...@gmail.com wrote:
 On Wed, Aug 13, 2014 at 2:10 PM, Fujii Masao masao.fu...@gmail.com wrote:
 I sent the SIGSTOP signal to the walreceiver process in one of the sync
 standbys, and then ran write transactions again. In this case, they must not
 be completed, because their WAL cannot be replicated to the standby whose
 walreceiver was stopped. But they were successfully completed.

 At the end of SyncRepReleaseWaiters, SYNC_REP_WAIT_WRITE and
 SYNC_REP_WAIT_FLUSH in walsndctl could be updated with only one WAL
 sender in sync, making backends wake up even if the other standbys had not
 caught up. But we need to scan all the synchronous WAL senders, find
 the minimum write and flush positions, and update walsndctl with those
 values. Well, that's a code path I forgot to cover.

 Attached is an updated patch fixing the problem you reported.

+At any one time there will be at a number of active
synchronous standbys
+defined by varnamesynchronous_standby_num/; transactions waiting

It's better to use xref linkend=guc-synchronous-standby-num, instead.

+for commit will be allowed to proceed after those standby servers
+confirms receipt of their data. The synchronous standbys will be

Typo: confirms -> confirm

+   para
+Specifies the number of standbys that support
+firsttermsynchronous replication/, as described in
+xref linkend=synchronous-replication, and listed as the first
+elements of xref linkend=guc-synchronous-standby-names.
+   /para
+   para
+Default value is 1.
+   /para

synchronous_standby_num is defined with PGC_SIGHUP. So the following
should be added into the document.

This parameter can only be set in the postgresql.conf file or on
the server command line.

The name of the parameter synchronous_standby_num sounds to me as if
the transaction must wait for its WAL to be replicated to s_s_num standbys.
But that's not true in your patch. If s_s_names is empty, replication works
asynchronously whatever the value of s_s_num is. I'm afraid that's confusing.

The description of s_s_num is not sufficient. I'm afraid that users can easily
misunderstand it to mean that they get a quorum commit feature by using
s_s_names and s_s_num, that is, that the transaction waits for its WAL to be
replicated to any s_s_num standbys listed in s_s_names.

When s_s_num is set to a larger value than max_wal_senders, shouldn't we warn
about that?

+for (i = 0; i < num_sync; i++)
+{
+volatile WalSnd *walsndloc = &WalSndCtl->walsnds[sync_standbys[i]];
+
+if (min_write_pos > walsndloc->write)
+min_write_pos = walsndloc->write;
+if (min_flush_pos > walsndloc->flush)
+min_flush_pos = walsndloc->flush;
+}

I don't think that it's safe to see those shared values without spinlock.
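For concreteness, a sketch of how the loop could read those shared values under
each walsender's spinlock, following the convention used elsewhere in
walsender.c; sync_standbys, num_sync and the min_* variables are taken from the
quoted patch, the rest is just a sketch:

	for (i = 0; i < num_sync; i++)
	{
		volatile WalSnd *walsndloc = &WalSndCtl->walsnds[sync_standbys[i]];
		XLogRecPtr	write;
		XLogRecPtr	flush;

		/* read the shared positions under the per-walsender spinlock */
		SpinLockAcquire(&walsndloc->mutex);
		write = walsndloc->write;
		flush = walsndloc->flush;
		SpinLockRelease(&walsndloc->mutex);

		if (min_write_pos > write)
			min_write_pos = write;
		if (min_flush_pos > flush)
			min_flush_pos = flush;
	}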

Regards,

-- 
Fujii Masao




Re: [HACKERS] add line number as prompt option to psql

2014-08-14 Thread Sawada Masahiko
On Sat, Jul 12, 2014 at 2:19 AM, Alvaro Herrera
alvhe...@2ndquadrant.com wrote:
 Sawada Masahiko wrote:

 As you said, if line number reached UINT_MAX then I think that this
 case is too strange.
 I think INT_MAX is enough for line number.

 My point is not whether 2 billion is a better number than 4 billion as a
 maximum value.  My point is that wraparound of signed int is, I think,
 not even defined in C, whereas wraparound of unsigned int is well
 defined.  cur_line should be declared as unsigned int.  I don't trust
 that INT_MAX+2 arithmetic.

 Please don't use cur_line as a name for a global variable.  Something
 like PSQLLineNumber seems more appropriate if it's going to be exposed
 through prompt.h.  However, note that MainLoop() keeps state in local
 variables and notes that it is reentrant; what happens to your cur_line
 when a file is read by \i and similar?  I wonder if it should be part of
 PsqlScanStateData instead ...


Thank you for the comments.
I have started working on this patch again.

Attached is a new version of the patch, rebased.
The pset structure now has a cur_lineno variable, an unsigned 64-bit integer
that tracks the current line number.
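For reference on the wraparound point above, a minimal standalone example:
unsigned arithmetic wraps modulo 2^N by definition, whereas signed overflow is
undefined behavior in C.

#include <limits.h>
#include <stdio.h>

int
main(void)
{
	unsigned int lineno = UINT_MAX;

	lineno = lineno + 2;		/* well defined: wraps around to 1 */
	printf("%u\n", lineno);		/* prints 1 */
	return 0;
}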

Regards,

---
Sawada Masahiko


psql-line-number_v6.patch
Description: Binary data



Re: [HACKERS] Improvement of versioning on Windows, take two

2014-08-14 Thread MauMau
I confirmed that all the issues are solved.  The patch content looks good, 
although I'm not confident in Perl.  I marked this patch as ready for 
committer.  I didn't try the patch on MinGW.


Regards
MauMau





[HACKERS] pgbench throttling latency limit

2014-08-14 Thread Fabien COELHO


Add --limit to limit latency under throttling

Under throttling, transactions are scheduled for execution at certain 
times. Transactions may be far behind schedule, and the system may catch up 
with the load later. This option allows changing that behavior by 
skipping transactions which are too far behind schedule and counting those 
as skipped.


The idea is to help simulate a latency-constrained environment such as a 
database used by a web server.


--
Fabien.diff --git a/contrib/pgbench/pgbench.c b/contrib/pgbench/pgbench.c
index 2f7d80e..3d43321 100644
--- a/contrib/pgbench/pgbench.c
+++ b/contrib/pgbench/pgbench.c
@@ -141,6 +141,13 @@ double		sample_rate = 0.0;
 int64		throttle_delay = 0;
 
 /*
+ * When under throttling, execution time slots which are more than
+ * this late (in us) are skipped, and the corresponding transaction
+ * will be counted as somehow aborted.
+ */
+int64		throttle_latency_limit = 0;
+
+/*
  * tablespace selection
  */
 char	   *tablespace = NULL;
@@ -238,6 +245,7 @@ typedef struct
 	int64		throttle_trigger;		/* previous/next throttling (us) */
 	int64		throttle_lag;	/* total transaction lag behind throttling */
 	int64		throttle_lag_max;		/* max transaction lag */
+	int64		throttle_latency_skipped; /* lagging transactions skipped */
 } TState;
 
 #define INVALID_THREAD		((pthread_t) 0)
@@ -250,6 +258,7 @@ typedef struct
 	int64		sqlats;
 	int64		throttle_lag;
 	int64		throttle_lag_max;
+	int64		throttle_latency_skipped;
 } TResult;
 
 /*
@@ -367,6 +376,8 @@ usage(void)
 		   -f, --file=FILENAME  read transaction script from FILENAME\n
 		 -j, --jobs=NUM   number of threads (default: 1)\n
 		 -l, --logwrite transaction times to log file\n
+		 -L, --limit=NUM  under throttling (--rate), skip transactions that\n
+		  far behind schedule in ms (default: do not skip)\n
 		 -M, --protocol=simple|extended|prepared\n
 		  protocol for submitting queries (default: simple)\n
 		 -n, --no-vacuum  do not run VACUUM before tests\n
@@ -1016,6 +1027,24 @@ top:
 
 		thread-throttle_trigger += wait;
 
+		if (throttle_latency_limit)
+		{
+			instr_time	now;
+			int64		now_us;
+			INSTR_TIME_SET_CURRENT(now);
+			now_us = INSTR_TIME_GET_MICROSEC(now);
+			while (thread->throttle_trigger < now_us - throttle_latency_limit)
+			{
+/* if too far behind, this slot is skipped, and we
+ * iterate till the next nearly on time slot.
+ */
+int64 wait = (int64) (throttle_delay *
+	1.00055271703 * -log(getrand(thread, 1, 1) / 1.0));
+thread->throttle_trigger += wait;
+thread->throttle_latency_skipped++;
+			}
+		}
+
 		st->until = thread->throttle_trigger;
 		st->sleeping = 1;
 		st->throttling = true;
@@ -2351,7 +2380,8 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 			 TState *threads, int nthreads,
 			 instr_time total_time, instr_time conn_total_time,
 			 int64 total_latencies, int64 total_sqlats,
-			 int64 throttle_lag, int64 throttle_lag_max)
+			 int64 throttle_lag, int64 throttle_lag_max,
+			 int64 throttle_latency_skipped)
 {
 	double		time_include,
 tps_include,
@@ -2417,6 +2447,10 @@ printResults(int ttype, int64 normal_xacts, int nclients,
 		 */
 		printf("rate limit schedule lag: avg %.3f (max %.3f) ms\n",
 			   0.001 * throttle_lag / normal_xacts, 0.001 * throttle_lag_max);
+		if (throttle_latency_limit)
+			printf("number of skipped transactions: " INT64_FORMAT " (%.3f %%)\n",
+   throttle_latency_skipped,
+   100.0 * throttle_latency_skipped / normal_xacts);
 	}
 
 	printf("tps = %f (including connections establishing)\n", tps_include);
@@ -2505,6 +2539,7 @@ main(int argc, char **argv)
 		{"sampling-rate", required_argument, NULL, 4},
 		{"aggregate-interval", required_argument, NULL, 5},
 		{"rate", required_argument, NULL, 'R'},
+		{"limit", required_argument, NULL, 'L'},
 		{NULL, 0, NULL, 0}
 	};
 
@@ -2534,6 +2569,7 @@ main(int argc, char **argv)
 	int64		total_sqlats = 0;
 	int64		throttle_lag = 0;
 	int64		throttle_lag_max = 0;
+	int64		throttle_latency_skipped = 0;
 
 	int			i;
 
@@ -2775,6 +2811,18 @@ main(int argc, char **argv)
 	throttle_delay = (int64) (100.0 / throttle_value);
 }
 break;
+			case 'L':
+{
+	double limit_ms = atof(optarg);
+	if (limit_ms <= 0.0)
+	{
+		fprintf(stderr, "invalid latency limit: %s\n", optarg);
+		exit(1);
+	}
+	benchmarking_option_set = true;
+	throttle_latency_limit = (int64) (limit_ms * 1000);
+}
+break;
 			case 0:
 /* This covers long options which take no argument. */
 if (foreign_keys || unlogged_tables)
@@ -2895,6 +2943,12 @@ main(int argc, char **argv)
 		exit(1);
 	}
 
+	if (throttle_latency_limit != 0 && throttle_delay == 0)
+	{
+		fprintf(stderr, "latency limit (-L) can only be specified with throttling (-R)\n");
+		exit(1);
+	}
+
 	/*
 	 * is_latencies only works with multiple threads in thread-based
 	 * 

Re: [HACKERS] pg_dump bug in 9.4beta2 and HEAD

2014-08-14 Thread Alvaro Herrera
Heikki Linnakangas wrote:
 On 08/14/2014 06:53 AM, Joachim Wieland wrote:
 I'm seeing an assertion failure with pg_dump -c --if-exists which is
 not ready to handle BLOBs it seems:
 
 pg_dump: pg_backup_archiver.c:472: RestoreArchive: Assertion `mark !=
 ((void *)0)' failed.
 
 To reproduce:
 
 $ createdb test
 $ pg_dump -c --if-exists test  (works, dumps empty database)
 $ psql test -c select lo_create(1);
 $ pg_dump -c --if-exists test  (fails, with the above mentioned assertion)
 
 The code tries to inject an IF EXISTS into the already-construct
 DROP command, but it doesn't work for large objects, because the
 deletion command looks like SELECT pg_catalog.lo_unlink(xxx).
 There is no DROP there.

Ah, so this was broken by 9067310cc5dd590e36c2c3219dbf3961d7c9f8cb.
Pavel, any thoughts here?  Can you provide a fix?

We already have a couple of object-type-specific checks in there, so I
think it's okay to add one more exception for large objects.

-- 
Álvaro Herrera                http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services




Re: [HACKERS] pg_dump bug in 9.4beta2 and HEAD

2014-08-14 Thread Pavel Stehule
2014-08-14 15:11 GMT+02:00 Alvaro Herrera alvhe...@2ndquadrant.com:

 Heikki Linnakangas wrote:
  On 08/14/2014 06:53 AM, Joachim Wieland wrote:
  I'm seeing an assertion failure with pg_dump -c --if-exists which is
  not ready to handle BLOBs it seems:
  
  pg_dump: pg_backup_archiver.c:472: RestoreArchive: Assertion `mark !=
  ((void *)0)' failed.
  
  To reproduce:
  
  $ createdb test
  $ pg_dump -c --if-exists test  (works, dumps empty database)
  $ psql test -c select lo_create(1);
  $ pg_dump -c --if-exists test  (fails, with the above mentioned
 assertion)
 
  The code tries to inject an IF EXISTS into the already-construct
  DROP command, but it doesn't work for large objects, because the
  deletion command looks like SELECT pg_catalog.lo_unlink(xxx).
  There is no DROP there.

 Ah, so this was broken by 9067310cc5dd590e36c2c3219dbf3961d7c9f8cb.
 Pavel, any thoughts here?  Can you provide a fix?

 We already have a couple of object-type-specific checks in there, so I
 think it's okay to add one more exception for large objects.


I will prepare this fix today.

regards

Pavel



 --
 Álvaro Herrera                http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services



Re: Compute attr_needed for child relations (was Re: [HACKERS] inherit support for foreign tables)

2014-08-14 Thread Ashutosh Bapat
Hi,



On Thu, Aug 14, 2014 at 10:05 AM, Etsuro Fujita fujita.ets...@lab.ntt.co.jp
 wrote:

 (2014/08/08 18:51), Etsuro Fujita wrote:
  (2014/06/30 22:48), Tom Lane wrote:
  I wonder whether it isn't time to change that.  It was coded like that
  originally only because calculating the values would've been a waste
 of
  cycles at the time.  But this is at least the third place where it'd
 be
  useful to have attr_needed for child rels.

  I've revised the patch.

 There was a problem with the previous patch, which will be described
 below.  Attached is the updated version of the patch addressing that.

 The previous patch doesn't cope with some UNION ALL cases properly.  So,
 e.g., the server will crash for the following query:

 postgres=# create table ta1 (f1 int);
 CREATE TABLE
 postgres=# create table ta2 (f2 int primary key, f3 int);
 CREATE TABLE
 postgres=# create table tb1 (f1 int);
 CREATE TABLE
 postgres=# create table tb2 (f2 int primary key, f3 int);
 CREATE TABLE
 postgres=# explain verbose select f1 from ((select f1, f2 from (select
 f1, f2, f3 from ta1 left join ta2 on f1 = f2 limit 1) ssa) union all
 (select f1,
 f2 from (select f1, f2, f3 from tb1 left join tb2 on f1 = f2 limit 1)
 ssb)) ss;

 With the updated version, we get the right result:

 postgres=# explain verbose select f1 from ((select f1, f2 from (select
 f1, f2, f3 from ta1 left join ta2 on f1 = f2 limit 1) ssa) union all
 (select f1,
 f2 from (select f1, f2, f3 from tb1 left join tb2 on f1 = f2 limit 1)
 ssb)) ss;
                              QUERY PLAN
 ----------------------------------------------------------------------------
  Append  (cost=0.00..0.05 rows=2 width=4)
    ->  Subquery Scan on ssa  (cost=0.00..0.02 rows=1 width=4)
          Output: ssa.f1
          ->  Limit  (cost=0.00..0.01 rows=1 width=4)
                Output: ta1.f1, (NULL::integer), (NULL::integer)
                ->  Seq Scan on public.ta1  (cost=0.00..34.00 rows=2400 width=4)
                      Output: ta1.f1, NULL::integer, NULL::integer
    ->  Subquery Scan on ssb  (cost=0.00..0.02 rows=1 width=4)
          Output: ssb.f1
          ->  Limit  (cost=0.00..0.01 rows=1 width=4)
                Output: tb1.f1, (NULL::integer), (NULL::integer)
                ->  Seq Scan on public.tb1  (cost=0.00..34.00 rows=2400 width=4)
                      Output: tb1.f1, NULL::integer, NULL::integer
  Planning time: 0.453 ms
 (14 rows)

 While thinking to address this problem, Ashutosh also expressed concern
 about the UNION ALL handling in the previous patch in a private email.
 Thank you for the review, Ashutosh!


Thanks for taking care of this one.

Here are some more comments

attr_needed also has the attributes used in the restriction clauses, as seen
in distribute_qual_to_rels(), so it looks unnecessary to call
pull_varattnos() on the clauses in baserestrictinfo in the functions
check_selective_binary_conversion(), remove_unused_subquery_outputs(), and
check_index_only().

Although in the case of RTE_RELATION the 0th entry in attr_needed corresponds
to FirstLowInvalidHeapAttributeNumber + 1, it's always safer to use
RelOptInfo::min_attr, in case get_relation_info() wants to change that
assumption or, somewhere down the line, some other part of the code wants to
change the attr_needed information. It may be unlikely that it would change,
but it is better to stick to the comments in RelOptInfo:
 443 AttrNumber  min_attr;   /* smallest attrno of rel (often 0) */
 444 AttrNumber  max_attr;   /* largest attrno of rel */
 445 Relids *attr_needed;/* array indexed [min_attr ..
max_attr] */


 Thanks,

 Best regards,
 Etsuro Fujita




-- 
Best Wishes,
Ashutosh Bapat
EnterpriseDB Corporation
The Postgres Database Company


Re: [HACKERS] pg_dump bug in 9.4beta2 and HEAD

2014-08-14 Thread Alvaro Herrera
Heikki Linnakangas wrote:

 The quick fix would be to add an exception for blobs, close to where
 Assert is. There are a few exceptions there already. A cleaner
 solution would be to add a new argument to ArchiveEntry and make the
 callers responsible for providing an DROP IF EXISTS query, but
 that's not too appetizing because for most callers it would be
 boring boilerplate code. Perhaps add an argument, but if it's NULL,
 ArchiveEntry would form the if-exists query automatically from the
 DROP query.

Yeah, this was discussed (or at least mentioned) in the original thread.
See
http://www.postgresql.org/message-id/20140228183100.gu4...@eldon.alvh.no-ip.org
where I wrote:

: I still find the code to inject IF EXISTS to the DROP commands ugly as
: sin.  I would propose to stop storing the dropStmt in the archive
: anymore; instead just store the object identity, which can later be used
: to generate both DROP commands, with or without IF EXISTS, and the ALTER
: OWNER commands.  However, that's a larger project and I don't think we
: need to burden this patch with that.

-- 
Álvaro Herrera                http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services




Re: [HACKERS] replication commands and log_statements

2014-08-14 Thread Stephen Frost
Amit,

* Amit Kapila (amit.kapil...@gmail.com) wrote:
 On Thu, Aug 14, 2014 at 5:56 AM, Stephen Frost sfr...@snowman.net wrote:
  Regarding this, I'm generally in the camp that says to just include it
  in 'all' and be done with it- for now.
 
 Okay, but if tomorrow someone wants to implement a patch to log
 statements executed through SPI (statements inside functions), then
 what will your suggestion be: can those also be logged under the 'all'
 option, or would you suggest waiting for a proper
 auditing system?

No, I'd suggest having a different option for it as that would be a huge
change for people who are already doing 'all', potentially.  Adding the
replication commands is extremely unlikely to cause individuals who are
already logging 'all' any problems, as far as I can tell.

 Wouldn't allowing everything to be logged under the 'all' option start
 confusing some users, without individual
 (ddl, mod, replication, ...) options for different kinds of statements?

I don't see logging replication commands under 'all' as confusing, no.

Thanks,

Stephen




Re: [HACKERS] 9.5: Memory-bounded HashAgg

2014-08-14 Thread Tom Lane
Jeff Davis pg...@j-davis.com writes:
 I think the hash-join like approach is reasonable, but I also think
 you're going to run into a lot of challenges that make it more complex
 for HashAgg. For instance, let's say you have the query:

   SELECT x, array_agg(y) FROM foo GROUP BY x;

 Say the transition state is an array (for the sake of simplicity), so
 the hash table has something like:

   1000 = {7,   8,  9}
   1001 = {12, 13, 14}

 You run out of memory and need to split the hash table, so you scan the
 hash table and find that group 1001 needs to be written to disk. So you
 serialize the key and array and write them out.

 Then the next tuple you get is (1001, 19). What do you do? Create a new
 group 1001 = {19} (how do you combine it later with the first one)? Or
 try to fetch the existing group 1001 from disk and advance it (horrible
 random I/O)?

If you're following the HashJoin model, then what you do is the same thing
it does: you write the input tuple back out to the pending batch file for
the hash partition that now contains key 1001, whence it will be processed
when you get to that partition.  I don't see that there's any special case
here.
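Sketched in C for concreteness (all struct and helper names here are
hypothetical, not from Jeff's patch): following the HashJoin model, a tuple
whose group hashes into a partition that is not currently in memory is simply
appended to that partition's batch file and re-read in a later pass.

	static void
	hashagg_route_tuple(HashAggSpillState *state, MinimalTuple mtup, uint32 hash)
	{
		int			batchno = hash % state->nbatch;

		if (batchno == state->curbatch)
		{
			/* group lives in the in-memory hash table: advance its state */
			lookup_or_advance_group(state, mtup, hash);
		}
		else
		{
			/* not our batch: stash the raw input tuple for a later pass */
			BufFileWrite(state->batchfile[batchno], mtup, mtup->t_len);
		}
	}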

The fly in the ointment is how to serialize a partially-computed aggregate
state value to disk, if it's not of a defined SQL type.

regards, tom lane




Re: [HACKERS] psql \watch versus \timing

2014-08-14 Thread Fujii Masao
On Mon, May 20, 2013 at 7:33 AM, Tom Lane t...@sss.pgh.pa.us wrote:
 Jeff Janes jeff.ja...@gmail.com writes:
 I'd like to run same query repeatedly and see how long it takes each time.
 I thought \watch would be excellent for this, but it turns out that using
 \watch suppresses the output of \timing.

 Is this intentional, or unavoidable?

 \watch uses PSQLexec not SendQuery; the latter implements \timing which
 I agree is arguably useful here, but also autocommit/auto-savepoint
 behavior which probably isn't a good idea.

 It might be a good idea to refactor those two routines into one routine
 with some sort of bitmap flags argument to control the various add-on
 behaviors, but that seems like not 9.3 material anymore.

The attached patch changes \watch so that it displays how long the query takes
if \timing is enabled.

I didn't refactor PSQLexec and SendQuery into one routine because
the contents of those functions are not that similar. I'm not sure how much
that refactoring would be worth. Anyway, this feature is quite useful
even without it, I think.

BTW, I found that \watch doesn't check for async notifications. Would it be
useful to allow \watch to do that? ISTM that it's not such a bad idea to use
\watch to continuously check for async notifications. No?

Regards,

-- 
Fujii Masao
*** a/src/bin/psql/command.c
--- b/src/bin/psql/command.c
***
*** 2690,2695  do_watch(PQExpBuffer query_buf, long sleep)
--- 2690,2696 
  		PGresult   *res;
  		time_t		timer;
  		long		i;
+ 		double	elapsed_msec = 0;
  
  		/*
  		 * Prepare title for output.  XXX would it be better to use the time
***
*** 2701,2710  do_watch(PQExpBuffer query_buf, long sleep)
  		myopt.title = title;
  
  		/*
! 		 * Run the query.  We use PSQLexec, which is kind of cheating, but
! 		 * SendQuery doesn't let us suppress autocommit behavior.
  		 */
! 		res = PSQLexec(query_buf->data, false);
  
  		/* PSQLexec handles failure results and returns NULL */
  		if (res == NULL)
--- 2702,2711 
  		myopt.title = title;
  
  		/*
! 		 * Run the query.  We use PSQLexecInternal, which is kind of cheating,
! 		 * but SendQuery doesn't let us suppress autocommit behavior.
  		 */
! 		res = PSQLexecInternal(query_buf->data, false, &elapsed_msec);
  
  		/* PSQLexec handles failure results and returns NULL */
  		if (res == NULL)
***
*** 2755,2760  do_watch(PQExpBuffer query_buf, long sleep)
--- 2756,2765 
  
  		fflush(pset.queryFout);
  
+ 		/* Possible microtiming output */
+ 		if (pset.timing)
+ 			printf(_("Time: %.3f ms\n"), elapsed_msec);
+ 
  		/*
  		 * Set up cancellation of 'watch' via SIGINT.  We redo this each time
  		 * through the loop since it's conceivable something inside PSQLexec
*** a/src/bin/psql/common.c
--- b/src/bin/psql/common.c
***
*** 439,445  AcceptResult(const PGresult *result)
--- 439,459 
  PGresult *
  PSQLexec(const char *query, bool start_xact)
  {
+ 	return PSQLexecInternal(query, start_xact, NULL);
+ }
+ 
+ /*
+  * Send backdoor queries.
+  *
+  * Measure how long the given query takes if elapsed_msec is not NULL and
+  * \timing is enabled.
+  */
+ PGresult *
+ PSQLexecInternal(const char *query, bool start_xact, double *elapsed_msec)
+ {
  	PGresult   *res;
+ 	instr_time	before;
+ 	instr_time	after;
  
  	if (!pset.db)
  	{
***
*** 483,488  PSQLexec(const char *query, bool start_xact)
--- 497,505 
  		PQclear(res);
  	}
  
+ 	if (elapsed_msec != NULL && pset.timing)
+ 		INSTR_TIME_SET_CURRENT(before);
+ 
  	res = PQexec(pset.db, query);
  
  	ResetCancelConn();
***
*** 493,498  PSQLexec(const char *query, bool start_xact)
--- 510,522 
  		res = NULL;
  	}
  
+ 	if (elapsed_msec != NULL && pset.timing)
+ 	{
+ 		INSTR_TIME_SET_CURRENT(after);
+ 		INSTR_TIME_SUBTRACT(after, before);
+ 		*elapsed_msec = INSTR_TIME_GET_MILLISEC(after);
+ 	}
+ 
  	return res;
  }
  
*** a/src/bin/psql/common.h
--- b/src/bin/psql/common.h
***
*** 37,42  extern void SetCancelConn(void);
--- 37,44 
  extern void ResetCancelConn(void);
  
  extern PGresult *PSQLexec(const char *query, bool start_xact);
+ extern PGresult *PSQLexecInternal(const char *query, bool start_xact,
+ double *elapsed_msec);
  
  extern bool SendQuery(const char *query);
  

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Immediate standby promotion

2014-08-14 Thread Tom Lane
Fujii Masao masao.fu...@gmail.com writes:
 I'd like to propose to add new option --immediate to pg_ctl promote.
 When this option is set, recovery ignores any WAL data which have not
 been replayed yet and exits immediately. Patch attached.

 This promotion is faster than normal one but can cause data loss.

TBH, I cannot imagine a situation where that would be a sane thing to do.
If you have WAL, why would you not replay what you have?  The purpose
of a database is to preserve your data, not randomly throw it away.

 Also imagine the case
 where, while recovery is being delayed (by a time-delayed standby
 which was introduced in 9.4) or paused (by pg_xlog_replay_pause),
 you find that subsequent WAL data can cause a disaster to happen
 (for example, WAL data indicating an unexpected deletion of
 an important database). In this case, this immediate promotion can be
 used to ignore such problematic WAL data.

That example does not demonstrate that a patch like this is useful.
What you'd presumably want is a way to stop replay at a defined place
(comparable to the PITR facilities).  Not to just abandon the WAL stream
at whatever point replay has reached.

regards, tom lane


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] 9.5: Memory-bounded HashAgg

2014-08-14 Thread Tomas Vondra
On 14 Srpen 2014, 9:22, Jeff Davis wrote:
 I think the hash-join like approach is reasonable, but I also think
 you're going to run into a lot of challenges that make it more complex
 for HashAgg. For instance, let's say you have the query:

   SELECT x, array_agg(y) FROM foo GROUP BY x;

 Say the transition state is an array (for the sake of simplicity), so
 the hash table has something like:

   1000 = {7,   8,  9}
   1001 = {12, 13, 14}

 You run out of memory and need to split the hash table, so you scan the
 hash table and find that group 1001 needs to be written to disk. So you
 serialize the key and array and write them out.

 Then the next tuple you get is (1001, 19). What do you do? Create a new
 group 1001 = {19} (how do you combine it later with the first one)? Or
 try to fetch the existing group 1001 from disk and advance it (horrible
 random I/O)?

No, that's not how it works. The batching algorithm works with a hash of
the group. For example let's suppose you do this:

   batchno = hash % nbatches;

which essentially keeps the last few bits of the hash. 0 bits for
nbatches=1, 1 bit for nbatches=2, 2 bits for nbatches=4 etc.

So let's say we have 2 batches, and we're working on the first batch.
That means we're using 1 bit:

batchno = hash % 2;

and for the first batch we're keeping only groups with batchno=0. So
only groups with 0 as the last bit are in batchno==0.

When running out of memory, you simply do

nbatches *= 2

and start considering one more bit from the hash. So if you had this
before:

group_a = batchno=0 = {7,   8,  9}
group_b = batchno=0 = {12, 13, 14}
group_c = batchno=0 = {23,  1, 45}
group_d = batchno=0 = {77, 37, 54}

(where batchno is a bit string), after doubling the number of batches
you get something like this:

group_a = batchno=10 = {7,   8,  9}
group_b = batchno=00 = {12, 13, 14}
group_c = batchno=00 = {23,  1, 45}
group_d = batchno=10 = {77, 37, 54}

So you have only two possible batchno values here, depending on the new
most-significant bit - either you got 0 (which means it's still in the
current batch) or 1 (and you need to move it to the temp file of the
new batch).

Then, when you get a new tuple, you get its hash and do a simple check
of the last few bits - effectively computing batchno just like before

   batchno = hash % nbatches;

Either it belongs to the current batch (and either it's in the hash
table, or you add it there), or it's not - in that case write it to a
temp file.

It gets a bit more complex when you increase the number of batches
repeatedly (effectively you need to do the check/move when reading the
batches).
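
Just to make that check concrete, here's a rough standalone sketch in C
(made-up helper names, not the actual hashjoin/hashagg code):

#include <stdint.h>
#include <stdbool.h>

/* nbatches is always a power of two, so hash % nbatches simply keeps the
 * low-order log2(nbatches) bits of the group's hash value. */
static inline uint32_t
batch_for_hash(uint32_t hash, uint32_t nbatches)
{
    return hash % nbatches;
}

/* After doubling nbatches, a group that belonged to the current batch either
 * stays put (the newly considered bit is 0) or has to be moved to the temp
 * file of the newly created batch (the bit is 1). */
static inline bool
group_must_move(uint32_t hash, uint32_t old_nbatches)
{
    return batch_for_hash(hash, 2 * old_nbatches) !=
           batch_for_hash(hash, old_nbatches);
}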

For sure, it's not for free - it may write to quite a few files. Is it
more expensive than what you propose? I'm not sure about that. With
your batching scheme, you'll end up with a lower number of large batches,
and you'll need to read and split them, possibly repeatedly. The
batching scheme from hashjoin minimizes this.

IMHO the only way to find out is to do some actual tests.

 On Wed, 2014-08-13 at 12:31 +0200, Tomas Vondra wrote:
 My understanding of the batching algorithm (and I may be wrong on this
 one) is that once you choose the number of batches, it's pretty much
 fixed. Is that the case?

 It's only fixed for that one work item (iteration). A different K can
 be selected if memory is exhausted again. But you're right: this is a
 little less flexible than HashJoin.

 But what will happen in case of significant cardinality underestimate?
 I.e. what will happen if you decide to use 16 batches, and then find
 out 256 would be more appropriate? I believe you'll end up with batches
 16x the size you'd want, most likely exceeding work_mem.

 Yes, except that work_mem would never be exceeded. If the partitions are
 16X work_mem, then each would be added as another work_item, and
 hopefully it would choose better the next time.

Only for aggregates with fixed-length state. For aggregates with a growing
serialized state, the states may eventually exceed work_mem.

  One thing I like about my simple approach is that it returns a good
  number of groups after each pass, and then those are completely
 finished
  (returned to the operator above, even). That's impossible with
 HashJoin
  because the hashing all needs to be done before the probe phase
 begins.

 The hash-join approach returns ~1/N groups after each pass, so I fail to
 see how this is better?

 You can't return any tuples until you begin the probe phase, and that
 doesn't happen until you've hashed the entire inner side (which involves
 splitting and other work). With my patch, it will return some tuples
 after the first scan. Perhaps I'm splitting hairs here, but the idea of
 finalizing some groups as early as possible seems appealing.

I fail to see how this is different from your approach? How can you
output any tuples before processing the whole inner relation?

After the first scan, the hash-join approach is pretty much 

Re: [HACKERS] jsonb format is pessimal for toast compression

2014-08-14 Thread Tom Lane
Heikki Linnakangas hlinnakan...@vmware.com writes:
 For comparison, here's a patch that implements the scheme that Alexander 
 Korotkov suggested, where we store an offset every 8th element, and a 
 length in the others. It compresses Larry's example to 525 bytes. 
 Increasing the stride from 8 to 16 entries, it compresses to 461 bytes.

 A nice thing about this patch is that it's on-disk compatible with the 
 current format, hence initdb is not required.

TBH, I think that's about the only nice thing about it :-(.  It's
conceptually a mess.  And while I agree that this way avoids creating
a big-O performance issue for large arrays/objects, I think the micro
performance is probably going to be not so good.  The existing code is
based on the assumption that JBE_OFF() and JBE_LEN() are negligibly cheap;
but with a solution like this, it's guaranteed that one or the other is
going to be not-so-cheap.
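
To spell out the basic tension with toy code (nothing to do with the actual
JEntry macros): with all-offsets, finding where the n-th child starts is a
single array fetch, while with all-lengths it is a running sum over everything
in front of it; on the other hand the stored offsets keep growing and rarely
repeat, which is what makes the current format so unfriendly to pglz, whereas
lengths repeat nicely.

#include <stdint.h>

/* all-offsets: O(1) access, but poorly compressible metadata */
static uint32_t
child_start_from_offsets(const uint32_t *offsets, int n)
{
    return offsets[n];
}

/* all-lengths: nicely compressible metadata, but O(n) access */
static uint32_t
child_start_from_lengths(const uint32_t *lengths, int n)
{
    uint32_t    start = 0;
    int         i;

    for (i = 0; i < n; i++)
        start += lengths[i];
    return start;
}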

I think if we're going to do anything to the representation at all,
we need to refactor the calling code; at least fixing the JsonbIterator
logic so that it tracks the current data offset rather than expecting to
be able to compute it at no cost.

The difficulty in arguing about this is that unless we have an agreed-on
performance benchmark test, it's going to be a matter of unsupported
opinions whether one solution is faster than another.  Have we got
anything that stresses key lookup and/or array indexing?

regards, tom lane


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] 9.5: Memory-bounded HashAgg

2014-08-14 Thread Jeff Davis
On Thu, 2014-08-14 at 10:06 -0400, Tom Lane wrote:
 If you're following the HashJoin model, then what you do is the same thing
 it does: you write the input tuple back out to the pending batch file for
 the hash partition that now contains key 1001, whence it will be processed
 when you get to that partition.  I don't see that there's any special case
 here.

HashJoin only deals with tuples. With HashAgg, you have to deal with a
mix of tuples and partially-computed aggregate state values. Not
impossible, but it is a little more awkward than HashJoin.

Regards,
Jeff Davis




-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] 9.5: Memory-bounded HashAgg

2014-08-14 Thread Atri Sharma
On Thursday, August 14, 2014, Jeff Davis pg...@j-davis.com wrote:

 On Thu, 2014-08-14 at 10:06 -0400, Tom Lane wrote:
  If you're following the HashJoin model, then what you do is the same
 thing
  it does: you write the input tuple back out to the pending batch file for
  the hash partition that now contains key 1001, whence it will be
 processed
  when you get to that partition.  I don't see that there's any special
 case
  here.

 HashJoin only deals with tuples. With HashAgg, you have to deal with a
 mix of tuples and partially-computed aggregate state values. Not
 impossible, but it is a little more awkward than HashJoin.


+1

Not to mention future cases if we start maintaining multiple state
values,in regarded to grouping sets.

Regards,

Atri


-- 
Regards,

Atri
*l'apprenant*


Re: [HACKERS] Proposal to add a QNX 6.5 port to PostgreSQL

2014-08-14 Thread Baker, Keith [OCDUS Non-JJ]
Tom and Robert,

I tried a combination of PIPE lock and file lock (fcntl) as Tom had suggested.
Attached experimental patch has this logic...

Postmaster :
- get exclusive fcntl lock (to guard against race condition in PIPE-based lock)
- check PIPE for any existing readers
- open PIPE for read

All other backends:
- get shared fcntl lock
- open PIPE for read
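
In rough standalone C, the postmaster-side sequence is something like this
(file names and error handling are simplified here, and the backend variant
just takes F_RDLCK instead; see the attached patch for the real logic):

#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <stdbool.h>

/* Assumes the lock file and the FIFO already exist in the data directory
 * (the FIFO created with mkfifo() at initdb time, say). */
static bool
postmaster_acquire_data_dir(void)
{
    struct flock fl;
    int         lockfd;
    int         fifofd;

    /* 1. Exclusive fcntl lock, so no two processes run the probe at once. */
    lockfd = open("postmaster.fcntl_lock", O_RDWR | O_CREAT, 0600);
    if (lockfd < 0)
        return false;
    memset(&fl, 0, sizeof(fl));
    fl.l_type = F_WRLCK;
    fl.l_whence = SEEK_SET;
    if (fcntl(lockfd, F_SETLK, &fl) < 0)
        return false;           /* someone else holds the lock */

    /*
     * 2. Check the PIPE for existing readers.  A nonblocking write-open of
     * a FIFO succeeds only if some process has it open for reading, and
     * fails with ENXIO when there are no readers; any reader means another
     * postmaster or an orphaned backend is still alive.
     */
    fifofd = open("postmaster.pipe", O_WRONLY | O_NONBLOCK);
    if (fifofd >= 0)
    {
        close(fifofd);
        return false;           /* existing readers: refuse to start */
    }
    if (errno != ENXIO)
        return false;

    /* 3. Open the PIPE for read and keep it open for our whole lifetime. */
    fifofd = open("postmaster.pipe", O_RDONLY | O_NONBLOCK);
    return fifofd >= 0;
}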

Your feedback is appreciated.
Thanks.

-Keith Baker


 -Original Message-
 From: pgsql-hackers-ow...@postgresql.org [mailto:pgsql-hackers-
 ow...@postgresql.org] On Behalf Of Tom Lane
 Sent: Wednesday, July 30, 2014 11:02 AM
 To: Robert Haas
 Cc: Baker, Keith [OCDUS Non-JJ]; pgsql-hackers@postgresql.org
 Subject: Re: [HACKERS] Proposal to add a QNX 6.5 port to PostgreSQL
 
 Robert Haas robertmh...@gmail.com writes:
  On Tue, Jul 29, 2014 at 7:06 PM, Tom Lane t...@sss.pgh.pa.us wrote:
  Hm.  That particular protocol is broken: two postmasters doing it at
  the same time would both pass (because neither has it open for read
  at the instant where they try to write).  But we could possibly frob
  the idea until it works.  Bigger question is how portable is this behavior?
  I see named pipes (fifos) in SUS v2, which is our usual baseline
  assumption about what's portable across Unixen, so maybe it would
 work.
  But does NFS support named pipes?
 
  Looks iffy, on a quick search.  Sigh.
 
 I poked around, and it seems like a lot of the people who think it's flaky are
 imagining that they should be able to use a named pipe on an NFS server to
 pass data between two different machines.  That doesn't work, but it's not
 what we need, either.  For communication between processes on the same
 server, all that's needed is that the filesystem entry looks like a pipe to 
 the
 local kernel --- and that's been required NFS functionality since RFC1813 (v3,
 in 1995).
 
 So it seems like we could possibly go this route, assuming we can think of a
 variant of your proposal that's race-condition-free.  A disadvantage
 compared to a true file lock is that it would not protect against people 
 trying
 to start postmasters from two different NFS client machines --- but we don't
 have protection against that now.  (Maybe we could do this *and* do a
 regular file lock to offer some protection against that case, even if it's not
 bulletproof?)
 
   regards, tom lane
 
 
 --
 Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make
 changes to your subscription:
 http://www.postgresql.org/mailpref/pgsql-hackers


locking_20140814.patch
Description: locking_20140814.patch

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] jsonb format is pessimal for toast compression

2014-08-14 Thread Bruce Momjian
On Wed, Aug 13, 2014 at 09:01:43PM -0400, Tom Lane wrote:
 I wrote:
  That's a fair question.  I did a very very simple hack to replace the item
  offsets with item lengths -- turns out that that mostly requires removing
  some code that changes lengths to offsets ;-).  I then loaded up Larry's
  example of a noncompressible JSON value, and compared pg_column_size()
  which is just about the right thing here since it reports datum size after
  compression.  Remembering that the textual representation is 12353 bytes:
 
  json:   382 bytes
  jsonb, using offsets:   12593 bytes
  jsonb, using lengths:   406 bytes
 
 Oh, one more result: if I leave the representation alone, but change
 the compression parameters to set first_success_by to INT_MAX, this
 value takes up 1397 bytes.  So that's better, but still more than a
 3X penalty compared to using lengths.  (Admittedly, this test value
 probably is an outlier compared to normal practice, since it's a hundred
 or so repetitions of the same two strings.)

Uh, can we get compression for actual documents, rather than duplicate
strings?

-- 
  Bruce Momjian  br...@momjian.us        http://momjian.us
  EnterpriseDB http://enterprisedb.com

  + Everyone has their own god. +


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] 9.5: Memory-bounded HashAgg

2014-08-14 Thread Tom Lane
Jeff Davis pg...@j-davis.com writes:
 HashJoin only deals with tuples. With HashAgg, you have to deal with a
 mix of tuples and partially-computed aggregate state values. Not
 impossible, but it is a little more awkward than HashJoin.

Not sure that I follow your point.  You're going to have to deal with that
no matter what, no?

I guess in principle you could avoid the need to dump agg state to disk.
What you'd have to do is write out tuples to temp files even when you
think you've processed them entirely, so that if you later realize you
need to split the current batch, you can recompute the states of the
postponed aggregates from scratch (ie from the input tuples) when you get
around to processing the batch they got moved to.  This would avoid
confronting the how-to-dump-agg-state problem, but it seems to have little
else to recommend it.  Even if splitting a batch is a rare occurrence,
the killer objection here is that even a totally in-memory HashAgg would
have to write all its input to a temp file, on the small chance that it
would exceed work_mem and need to switch to batching.

regards, tom lane


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] B-Tree support function number 3 (strxfrm() optimization)

2014-08-14 Thread Robert Haas
On Thu, Aug 7, 2014 at 4:19 PM, Peter Geoghegan p...@heroku.com wrote:
 On Thu, Aug 7, 2014 at 12:17 PM, Robert Haas robertmh...@gmail.com wrote:
 Gah.  Hit send to soon.  Also, as much as I'd prefer to avoid
 relitigating the absolutely stupid debate about how to expand the
 buffers, this is no good:

 +   tss->buflen1 = Max(len1 + 1, tss->buflen1 * 2);

 If the first expansion is for a string > 512MB and the second string is
 longer than the first, this will exceed MaxAllocSize and error out.

 Fair point. I think this problem is already present in a few existing
 places, but it shouldn't be. I suggest this remediation:

 +   tss->buflen1 = Max(len1 + 1, Min(tss->buflen1 * 2, (int) 
 MaxAllocSize));

 I too would very much prefer to not repeat that debate.  :-)

Committed that way.  As the patch is by and large the same as what I
submitted for this originally, I credited myself as first author and
you as second author.  I hope that seems fair.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Reporting the commit LSN at commit time

2014-08-14 Thread Robert Haas
On Sat, Aug 9, 2014 at 12:54 PM, Andres Freund and...@2ndquadrant.com wrote:
 On 2014-08-07 21:02:54 -0400, Tom Lane wrote:
 Craig Ringer cr...@2ndquadrant.com writes:
  On 08/08/2014 03:54 AM, Tom Lane wrote:
  FWIW, I think it's a seriously bad idea to expose LSNs in the protocol
  at all.   What happens five years from now when we switch to some other
  implementation that doesn't have LSNs?

  Everyone who's relying on them already via pg_stat_replication, etc, 
  breaks.
  They're _already_ exposed to users. That ship has sailed.

 They're exposed to replication tools, yeah, but embedding them in the
 wire protocol would be moving the goalposts a long way past that.  As an
 example of something that doubtless seemed like a good idea at the time,
 consider the business about how an INSERT command completion tag includes
 the OID of the inserted row.  We're stuck with that obsolete idea
 *forever* because it's embedded in the protocol for all clients.

 I don't think we really need to embed it at that level. And it doesn't
 have to be always on - only clients that ask for it need to get the
 answer. Something like COMMIT WITH (report_commit_lsn ON); or similar
 might do the trick?

And what does that actually do?  Send back a result-set, or a new
protocol message?

I don't have a very clear idea whether this is a good idea in any form
because I can't quite imagine how this is going to be used by the
client without adding an unwarranted amount of complexity there.
However, based on my experiences at EnterpriseDB, I would be extremely
wary of extending the wire protocol.  As soon as we do that, it
requires updates to a really phenomenal amount of other software.
Software using libpq may be more or less able to ignore the
difference, as long as they have a new-enough version of libpq (which
is a significant proviso).  But any driver that has its own
implementation of the wire protocol (and I think there is at least one
and maybe several important ones that do) needs updating, and anything
that acts as middleware (pgpool, pgbouncer) does too.  And it's not
just a matter of the maintainers making the appropriate changes
(though that does need to happen); it's also about everyone who is
using the new server version getting new versions of that other
software also.

So, even accepting for the moment the premise that the basic idea here
is good, I think the benefits would have to be monumental to convince
me that a protocol change is a good idea.  If we do anything like
that, we'll be hearing about the downstream damage for years.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] jsonb format is pessimal for toast compression

2014-08-14 Thread Tom Lane
Bruce Momjian br...@momjian.us writes:
 Uh, can we get compression for actual documents, rather than duplicate
 strings?

[ shrug... ]  What's your proposed set of actual documents?
I don't think we have any corpus of JSON docs that are all large
enough to need compression.

This gets back to the problem of what test case are we going to consider
while debating what solution to adopt.

regards, tom lane


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] 9.5: Memory-bounded HashAgg

2014-08-14 Thread Tomas Vondra
On 14 Srpen 2014, 18:12, Tom Lane wrote:
 Jeff Davis pg...@j-davis.com writes:
 HashJoin only deals with tuples. With HashAgg, you have to deal with a
 mix of tuples and partially-computed aggregate state values. Not
 impossible, but it is a little more awkward than HashJoin.

 Not sure that I follow your point.  You're going to have to deal with that
 no matter what, no?

That is not how the patch works. Once the memory consumption hits work_mem,
it keeps the already existing groups in memory, and only stops creating
new groups. For each tuple, hashagg does a lookup - if the group is
already in memory, it performs the transition, otherwise it writes the
tuple to disk (and does some batching, but that's mostly irrelevant here).

This way it's not necessary to dump the partially-computed states, and for
fixed-size states it actually limits the amount of consumed memory. For
variable-length aggregates (array_agg et al.), not so much.
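
In over-simplified C, the per-tuple logic as I read the patch is roughly the
following (type and helper names are made up, not the patch's actual API):

#include <stdbool.h>

typedef struct Tuple Tuple;             /* stand-ins for the real structures */
typedef struct Group Group;

extern Group *hash_lookup(const Tuple *tup);
extern Group *hash_insert(const Tuple *tup);
extern bool hash_table_is_full(void);   /* i.e. we have hit work_mem */
extern void advance_transition(Group *group, const Tuple *tup);
extern void write_tuple_to_batch(const Tuple *tup);

static void
process_tuple(const Tuple *tup)
{
    Group      *group = hash_lookup(tup);

    if (group != NULL)
        advance_transition(group, tup);     /* existing group stays in memory */
    else if (!hash_table_is_full())
        advance_transition(hash_insert(tup), tup);  /* room for a new group */
    else
        write_tuple_to_batch(tup);          /* spill the raw tuple for later */
}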

 I guess in principle you could avoid the need to dump agg state to disk.
 What you'd have to do is write out tuples to temp files even when you
 think you've processed them entirely, so that if you later realize you
 need to split the current batch, you can recompute the states of the
 postponed aggregates from scratch (ie from the input tuples) when you get
 around to processing the batch they got moved to.  This would avoid
 confronting the how-to-dump-agg-state problem, but it seems to have little
 else to recommend it.  Even if splitting a batch is a rare occurrence,
 the killer objection here is that even a totally in-memory HashAgg would
 have to write all its input to a temp file, on the small chance that it
 would exceed work_mem and need to switch to batching.

Yeah, I think putting this burden on each hashagg is not a good thing.

What I was thinking about is an automatic fall-back - try to do an in-memory
hash-agg. When you hit work_mem limit, see how far we are (have we scanned
10% or 90% of tuples?), and decide whether to restart with batching.

But I think there's no single solution, fixing all the possible cases. I
think the patch proposed here is a solid starting point, that may be
improved and extended by further patches. Eventually, what I think might
work is this combination of approaches:

1) fixed-size states and states with serialize/deserialize methods

   = hashjoin-like batching (i.e. dumping both tuples and states)

2) variable-size states without serialize/deserialize

   = Jeff's approach (keep states in memory, dump tuples)
   = possibly with the rescan fall-back, for quickly growing states


Tomas



-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] 9.5: Memory-bounded HashAgg

2014-08-14 Thread Tomas Vondra
On 14 Srpen 2014, 18:02, Atri Sharma wrote:
 On Thursday, August 14, 2014, Jeff Davis pg...@j-davis.com wrote:

 On Thu, 2014-08-14 at 10:06 -0400, Tom Lane wrote:
  If you're following the HashJoin model, then what you do is the same
 thing
  it does: you write the input tuple back out to the pending batch file
 for
  the hash partition that now contains key 1001, whence it will be
 processed
  when you get to that partition.  I don't see that there's any special
 case
  here.

 HashJoin only deals with tuples. With HashAgg, you have to deal with a
 mix of tuples and partially-computed aggregate state values. Not
 impossible, but it is a little more awkward than HashJoin.


 +1

 Not to mention future cases if we start maintaining multiple state
 values,in regarded to grouping sets.

So what would you do for aggregates where the state is growing quickly?
Say, things like median() or array_agg()?

I think the fact that we can't do that for all aggregates does not imply we must
not do that at all.

There will always be aggregates not implementing dumping state for various
reasons, and in those cases the proposed approach is certainly a great
improvement. I like it, and I hope it will get committed.

But maybe for aggregates supporting serialize/deserialize of the state
(including all aggregates using known types, not just fixed-size types) a
hashjoin-like batching would be better? I can name a few custom aggregates
that'd benefit tremendously from this.

Just to be clear - this is certainly non-trivial to implement, and I'm not
trying to force anyone (e.g. Jeff) to implement the ideas I proposed. I'm
ready to spend time on reviewing the current patch, implement the approach
I proposed and compare the behaviour.

Kudos to Jeff for working on this.

Tomas



-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] jsonb format is pessimal for toast compression

2014-08-14 Thread Bruce Momjian
On Thu, Aug 14, 2014 at 12:22:46PM -0400, Tom Lane wrote:
 Bruce Momjian br...@momjian.us writes:
  Uh, can we get compression for actual documents, rather than duplicate
  strings?
 
 [ shrug... ]  What's your proposed set of actual documents?
 I don't think we have any corpus of JSON docs that are all large
 enough to need compression.
 
 This gets back to the problem of what test case are we going to consider
 while debating what solution to adopt.

Uh, we just need one 12k JSON document from somewhere.  Clearly this
is something we can easily get.

-- 
  Bruce Momjian  br...@momjian.us        http://momjian.us
  EnterpriseDB http://enterprisedb.com

  + Everyone has their own god. +


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] 9.5: Better memory accounting, towards memory-bounded HashAgg

2014-08-14 Thread Robert Haas
On Fri, Aug 8, 2014 at 4:16 AM, Jeff Davis pg...@j-davis.com wrote:
 I wasn't able to reproduce your results on my machine. At -s 300, with
 maintenance_work_mem set high enough to do internal sort, it took about
 40s and I heard some disk activity, so I didn't think it was a valid
 result. I went down to -s 150, and it took around 5.3s on both master
 and memory-accounting.

 Either way, it's better to be conservative. Attached is a version of the
 patch with opt-in memory usage tracking. Child contexts inherit the
 setting from their parent.

I repeated my tests with your v3 patch.  Here are the new results:

master, as of commit a4287a689d10bd4863e3dfbf9c4f232feeca0cdd:
2014-08-14 16:45:24 UTC [2940] LOG:  internal sort ended, 1723933 KB
used: CPU 2.44s/11.52u sec elapsed 16.68 sec
2014-08-14 16:46:34 UTC [2960] LOG:  internal sort ended, 1723933 KB
used: CPU 2.63s/11.65u sec elapsed 16.94 sec
2014-08-14 16:47:26 UTC [2979] LOG:  internal sort ended, 1723933 KB
used: CPU 2.63s/11.48u sec elapsed 16.85 sec

memory-accounting-v3, on top of the aforementioned master commit:
2014-08-14 16:46:05 UTC [2950] LOG:  internal sort ended, 1723933 KB
used: CPU 2.52s/12.16u sec elapsed 17.36 sec
2014-08-14 16:47:00 UTC [2969] LOG:  internal sort ended, 1723933 KB
used: CPU 2.52s/11.90u sec elapsed 17.11 sec
2014-08-14 16:47:51 UTC [2988] LOG:  internal sort ended, 1723933 KB
used: CPU 2.52s/11.98u sec elapsed 17.31 sec

It appears to me that the performance characteristics for this version
are not significantly different from version 1.  I have not looked at
the code.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] 9.5: Memory-bounded HashAgg

2014-08-14 Thread Tom Lane
Tomas Vondra t...@fuzzy.cz writes:
 On 14 Srpen 2014, 18:12, Tom Lane wrote:
 Not sure that I follow your point.  You're going to have to deal with that
 no matter what, no?

 That is not how the patch work. Once the memory consumption hits work_mem,
 it keeps the already existing groups in memory, and only stops creating
 new groups.

Oh?  So if we have aggregates like array_agg whose memory footprint
increases over time, the patch completely fails to avoid bloat?

I might think a patch with such a limitation was useful, if it weren't
for the fact that aggregates of that nature are a large part of the
cases where the planner misestimates the table size in the first place.
Any complication that we add to nodeAgg should be directed towards
dealing with cases that the planner is likely to get wrong.

regards, tom lane


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] 9.5: Memory-bounded HashAgg

2014-08-14 Thread Jeff Davis
On Thu, 2014-08-14 at 16:17 +0200, Tomas Vondra wrote:
 Either it belongs to the current batch (and either it's in the hash
 table, or you add it there), or it's not - in that case write it to a
 temp file.

I think the part you left out is that you need two files per batch: one
for the dumped-out partially-computed state values, and one for the
tuples.

In other words, you haven't really discussed the step where you reunite
the tuples with that partially-computed state.

 For sure, it's not for free - it may write to quite a few files. Is it
 more expensive than what you propose? I'm not sure about that. With
 your batching scheme, you'll end up with lower number of large batches,
 and you'll need to read and split them, possibly repeatedly. The
 batching scheme from hashjoin minimizes this.

My approach only has fewer batches if it elects to have fewer batches,
which might happen for two reasons:
 1. A cardinality misestimate. This certainly could happen, but we do
have useful numbers to work from (we know the number of tuples and
distincts that we've read so far), so it's far from a blind guess. 
 2. We're concerned about the random I/O from way too many partitions.

 I fail to see how this is different from your approach? How can you
 output any tuples before processing the whole inner relation?

Right, the only thing I avoid is scanning the hash table and dumping out
the groups.

This isn't a major distinction, more like my approach does a little
less work before returning tuples, and I'm not even sure I can defend
that, so I'll retract this point.

 Your approach is to do multi-level batching, and I was thinking whether
 it'd be possible to use the same approach (single level). But in
 retrospect it probably does not make much sense, because the multi-level
 batching is one of the points of the proposed approach.

Now that I think about it, many of the points we discussed could
actually work with either approach:
  * In my approach, if I need more partitions, I could create more in
much the same way as HashJoin to keep it single-level (as you suggest
above).
  * In your approach, if there are too many partitions, you could avoid
random I/O by intentionally putting tuples from multiple partitions in a
single file and moving them while reading.
  * If given a way to write out the partially-computed states, I could
evict some groups from the hash table to keep an array_agg() bounded.

Our approaches only differ on one fundamental trade-off that I see:
  (A) My approach requires a hash lookup of an already-computed hash for
every incoming tuple, not only the ones going into the hash table.
  (B) Your approach requires scanning the hash table and dumping out the
states every time the hash table fills up, which therefore requires a
way to dump out the partial states.

You could probably win the argument by pointing out that (A) is O(N) and
(B) is O(log2(N)). But I suspect that cost (A) is very low.

Unfortunately, it would take some effort to test your approach because
we'd actually need a way to write out the partially-computed state, and
the algorithm itself seems a little more complex. So I'm not really sure
how to proceed.

Regards,
Jeff Davis




-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Function to know last log write timestamp

2014-08-14 Thread Robert Haas
On Mon, Aug 11, 2014 at 3:20 AM, Fujii Masao masao.fu...@gmail.com wrote:
 There is no extra spinlock.

The version I reviewed had one; that's what I was objecting to.

Might need to add some pg_read_barrier() and pg_write_barrier() calls,
since we have those now.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


[HACKERS] replication commands and index terms

2014-08-14 Thread Fujii Masao
Hi,

I found that, of the replication commands, only CREATE_REPLICATION_SLOT
is exposed as an index term in the documentation. Is this intentional?
If not, for consistency, I think that we should either expose the other
replication commands as index terms as well, or remove
CREATE_REPLICATION_SLOT from the index. Thoughts?

Since I sometimes try to look up replication commands in the index,
I'd feel inclined to expose all of those commands as index terms...

Regards,

-- 
Fujii Masao


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] jsonb format is pessimal for toast compression

2014-08-14 Thread Merlin Moncure
On Thu, Aug 14, 2014 at 11:52 AM, Bruce Momjian br...@momjian.us wrote:
 On Thu, Aug 14, 2014 at 12:22:46PM -0400, Tom Lane wrote:
 Bruce Momjian br...@momjian.us writes:
  Uh, can we get compression for actual documents, rather than duplicate
  strings?

 [ shrug... ]  What's your proposed set of actual documents?
 I don't think we have any corpus of JSON docs that are all large
 enough to need compression.

 This gets back to the problem of what test case are we going to consider
 while debating what solution to adopt.

 Uh, we just need one 12k JSON document from somewhere.  Clearly this
 is something we can easily get.

it's trivial to make a large json[b] document:
select length(to_json(array(select row(a.*) from pg_attribute a))::TEXT);

select


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] B-Tree support function number 3 (strxfrm() optimization)

2014-08-14 Thread Peter Geoghegan
On Thu, Aug 14, 2014 at 9:13 AM, Robert Haas robertmh...@gmail.com wrote:
 Committed that way.  As the patch is by and large the same as what I
 submitted for this originally, I credited myself as first author and
 you as second author.  I hope that seems fair.


I think that's more than fair. Thanks!

-- 
Peter Geoghegan


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Function to know last log write timestamp

2014-08-14 Thread Fujii Masao
On Fri, Aug 15, 2014 at 1:55 AM, Robert Haas robertmh...@gmail.com wrote:
 On Mon, Aug 11, 2014 at 3:20 AM, Fujii Masao masao.fu...@gmail.com wrote:
 There is no extra spinlock.

 The version I reviewed had one; that's what I was objecting to.

Sorry for confusing you. I posted the latest patch to other thread.
This version doesn't use any spinlock.

http://www.postgresql.org/message-id/CAHGQGwEwuh5CC3ib6wd0fxs9LAWme=ko09s4moxnynafn7n...@mail.gmail.com

 Might need to add some pg_read_barrier() and pg_write_barrier() calls,
 since we have those now.

Yep, memory barries might be needed as follows.

* Set the commit timestamp to shared memory.

shmem->counter++;
pg_write_barrier();
shmem->timestamp = my_timestamp;
pg_write_barrier();
shmem->counter++;

* Read the commit timestamp from shared memory

my_count = shmem->counter;
pg_read_barrier();
my_timestamp = shmem->timestamp;
pg_read_barrier();
my_count = shmem->counter;

Is this way to use memory barriers right?

Regards,

-- 
Fujii Masao


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] jsonb format is pessimal for toast compression

2014-08-14 Thread Tom Lane
Bruce Momjian br...@momjian.us writes:
 On Thu, Aug 14, 2014 at 12:22:46PM -0400, Tom Lane wrote:
 This gets back to the problem of what test case are we going to consider
 while debating what solution to adopt.

 Uh, we just need one 12k JSON document from somewhere.  Clearly this
 is something we can easily get.

I would put little faith in a single document as being representative.

To try to get some statistics about a real-world case, I looked at the
delicio.us dataset that someone posted awhile back (1252973 JSON docs).
These have a minimum length (in text representation) of 604 bytes and
a maximum length of 5949 bytes, which means that they aren't going to
tell us all that much about large JSON docs, but this is better than
no data at all.

Since documents of only a couple hundred bytes aren't going to be subject
to compression, I made a table of four columns each containing the same
JSON data, so that each row would be long enough to force the toast logic
to try to do something.  (Note that none of these documents are anywhere
near big enough to hit the refuses-to-compress problem.)  Given that,
I get the following statistics for pg_column_size():

                                min     max     avg

JSON (text) representation      382     1155    526.5

HEAD's JSONB representation     493     1485    695.1

all-lengths representation      440     1257    615.3

So IOW, on this dataset the existing JSONB representation creates about
32% bloat compared to just storing the (compressed) user-visible text,
and switching to all-lengths would about halve that penalty.

Maybe this is telling us it's not worth changing the representation,
and we should just go do something about the first_success_by threshold
and be done.  I'm hesitant to draw such conclusions on the basis of a
single use-case though, especially one that doesn't really have that
much use for compression in the first place.  Do we have other JSON
corpuses to look at?

regards, tom lane


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] jsonb format is pessimal for toast compression

2014-08-14 Thread Peter Geoghegan
On Thu, Aug 14, 2014 at 10:57 AM, Tom Lane t...@sss.pgh.pa.us wrote:
 Maybe this is telling us it's not worth changing the representation,
 and we should just go do something about the first_success_by threshold
 and be done.  I'm hesitant to draw such conclusions on the basis of a
 single use-case though, especially one that doesn't really have that
 much use for compression in the first place.  Do we have other JSON
 corpuses to look at?


Yes. Pavel posted some representative JSON data a while back:
http://pgsql.cz/data/data.dump.gz (it's a plain dump)

-- 
Peter Geoghegan


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] 9.5: Memory-bounded HashAgg

2014-08-14 Thread Atri Sharma
On Thu, Aug 14, 2014 at 10:21 PM, Tomas Vondra t...@fuzzy.cz wrote:

 On 14 Srpen 2014, 18:02, Atri Sharma wrote:
  On Thursday, August 14, 2014, Jeff Davis pg...@j-davis.com wrote:
 
  On Thu, 2014-08-14 at 10:06 -0400, Tom Lane wrote:
   If you're following the HashJoin model, then what you do is the same
  thing
   it does: you write the input tuple back out to the pending batch file
  for
   the hash partition that now contains key 1001, whence it will be
  processed
   when you get to that partition.  I don't see that there's any special
  case
   here.
 
  HashJoin only deals with tuples. With HashAgg, you have to deal with a
  mix of tuples and partially-computed aggregate state values. Not
  impossible, but it is a little more awkward than HashJoin.
 
 
  +1
 
  Not to mention future cases if we start maintaining multiple state
  values,in regarded to grouping sets.

 So what would you do for aggregates where the state is growing quickly?
 Say, things like median() or array_agg()?

 I think that we can't do that for all aggregates does not imply we must
 not do that at all.

 There will always be aggregates not implementing dumping state for various
 reasons, and in those cases the proposed approach is certainly a great
 improvement. I like it, and I hope it will get committed.

 But maybe for aggregates supporting serialize/deserialize of the state
 (including all aggregates using known types, not just fixed-size types) a
 hashjoin-like batching would be better? I can name a few custom aggregates
 that'd benefit tremendously from this.


Yeah, could work, but is it worth adding additional paths (assuming this
patch gets committed) for some aggregates? I think we should do further
analysis of the use cases.


 Just to be clear - this is certainly non-trivial to implement, and I'm not
 trying to force anyone (e.g. Jeff) to implement the ideas I proposed. I'm
 ready to spend time on reviewing the current patch, implement the approach
 I proposed and compare the behaviour.


Totally agreed. It would be a different approach, although, as you said, it
could be built on top of the current patch.


 Kudos to Jeff for working on this.

 Agreed :)






-- 
Regards,

Atri
*l'apprenant*


Re: [HACKERS] jsonb format is pessimal for toast compression

2014-08-14 Thread Bruce Momjian
On Thu, Aug 14, 2014 at 01:57:14PM -0400, Tom Lane wrote:
 Maybe this is telling us it's not worth changing the representation,
 and we should just go do something about the first_success_by threshold
 and be done.  I'm hesitant to draw such conclusions on the basis of a
 single use-case though, especially one that doesn't really have that
 much use for compression in the first place.  Do we have other JSON
 corpuses to look at?

Yes, that is what I was expecting --- once the whitespace and syntax
sugar is gone in JSONB, I was unclear how much compression would help.

-- 
  Bruce Momjian  br...@momjian.us        http://momjian.us
  EnterpriseDB http://enterprisedb.com

  + Everyone has their own god. +


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Supporting Windows SChannel as OpenSSL replacement

2014-08-14 Thread Robert Haas
On Tue, Aug 12, 2014 at 1:52 PM, Heikki Linnakangas
hlinnakan...@vmware.com wrote:
 This isn't a showstopper, but needs some thought. As the patch stands, it
 uses a single key container called PostgreSQL server key container, and
 makes no attempt to delete the keys after they're no longer used. That
 works, but it leaves the key lying on the system.

What about using something like 'PostgreSQL ' || system_identifier?

Would it make sense to have pg_ctl unregister delete the key
container, or do we need a separate facility for that?

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Function to know last log write timestamp

2014-08-14 Thread Robert Haas
On Thu, Aug 14, 2014 at 1:51 PM, Fujii Masao masao.fu...@gmail.com wrote:
 On Fri, Aug 15, 2014 at 1:55 AM, Robert Haas robertmh...@gmail.com wrote:
 On Mon, Aug 11, 2014 at 3:20 AM, Fujii Masao masao.fu...@gmail.com wrote:
 There is no extra spinlock.

 The version I reviewed had one; that's what I was objecting to.

 Sorry for confusing you. I posted the latest patch to other thread.
 This version doesn't use any spinlock.

 http://www.postgresql.org/message-id/CAHGQGwEwuh5CC3ib6wd0fxs9LAWme=ko09s4moxnynafn7n...@mail.gmail.com

 Might need to add some pg_read_barrier() and pg_write_barrier() calls,
 since we have those now.

 Yep, memory barries might be needed as follows.

 * Set the commit timestamp to shared memory.

 shmem->counter++;
 pg_write_barrier();
 shmem->timestamp = my_timestamp;
 pg_write_barrier();
 shmem->counter++;

 * Read the commit timestamp from shared memory

 my_count = shmem->counter;
 pg_read_barrier();
 my_timestamp = shmem->timestamp;
 pg_read_barrier();
 my_count = shmem->counter;

 Is this way to use memory barriers right?

That's about the idea. However, what you've got there is actually
unsafe, because shmem->counter++ is not an atomic operation.  It reads
the counter (possibly even as two separate 4-byte loads if the counter
is an 8-byte value), increments it inside the CPU, and then writes the
resulting value back to memory.  If two backends do this concurrently,
one of the updates might be lost.
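
Spelled out as standalone toy code (not any particular backend file), the
increment is really three separate steps:

#include <stdint.h>

typedef struct
{
    volatile uint64_t counter;
} Shmem;

/* What "shmem->counter++" amounts to; nothing makes these steps atomic,
 * so a concurrent writer's update can be silently overwritten. */
static void
nonatomic_increment(Shmem *shmem)
{
    uint64_t    tmp;

    tmp = shmem->counter;       /* load (possibly two 4-byte loads) */
    tmp = tmp + 1;              /* increment in a register */
    shmem->counter = tmp;       /* store the result back */
}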

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] jsonb format is pessimal for toast compression

2014-08-14 Thread Josh Berkus
On 08/14/2014 11:13 AM, Bruce Momjian wrote:
 On Thu, Aug 14, 2014 at 01:57:14PM -0400, Tom Lane wrote:
 Maybe this is telling us it's not worth changing the representation,
 and we should just go do something about the first_success_by threshold
 and be done.  I'm hesitant to draw such conclusions on the basis of a
 single use-case though, especially one that doesn't really have that
 much use for compression in the first place.  Do we have other JSON
 corpuses to look at?
 
 Yes, that is what I was expecting --- once the whitespace and syntax
 sugar is gone in JSONB, I was unclear how much compression would help.

I thought the destruction case was when we have enough top-level keys
that the offsets are more than 1K total, though, yes?

So we need to test that set ...

-- 
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Function to know last log write timestamp

2014-08-14 Thread Andres Freund
On 2014-08-14 14:19:13 -0400, Robert Haas wrote:
 On Thu, Aug 14, 2014 at 1:51 PM, Fujii Masao masao.fu...@gmail.com wrote:
  On Fri, Aug 15, 2014 at 1:55 AM, Robert Haas robertmh...@gmail.com wrote:
  On Mon, Aug 11, 2014 at 3:20 AM, Fujii Masao masao.fu...@gmail.com wrote:
  There is no extra spinlock.
 
  The version I reviewed had one; that's what I was objecting to.
 
  Sorry for confusing you. I posted the latest patch to other thread.
  This version doesn't use any spinlock.
 
  http://www.postgresql.org/message-id/CAHGQGwEwuh5CC3ib6wd0fxs9LAWme=ko09s4moxnynafn7n...@mail.gmail.com
 
  Might need to add some pg_read_barrier() and pg_write_barrier() calls,
  since we have those now.
 
  Yep, memory barries might be needed as follows.
 
  * Set the commit timestamp to shared memory.
 
  shmem->counter++;
  pg_write_barrier();
  shmem->timestamp = my_timestamp;
  pg_write_barrier();
  shmem->counter++;

  * Read the commit timestamp from shared memory

  my_count = shmem->counter;
  pg_read_barrier();
  my_timestamp = shmem->timestamp;
  pg_read_barrier();
  my_count = shmem->counter;
 
  Is this way to use memory barriers right?
 
 That's about the idea. However, what you've got there is actually
 unsafe, because shmem->counter++ is not an atomic operation.  It reads
 the counter (possibly even as two separate 4-byte loads if the counter
 is an 8-byte value), increments it inside the CPU, and then writes the
 resulting value back to memory.  If two backends do this concurrently,
 one of the updates might be lost.

All these are only written by one backend, so it should be safe. Note
that that coding pattern, just without memory barriers, is all over
pgstat.c
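
For reference, with the barriers added the single-writer pattern would look
roughly like this (a sketch with invented names, not actual pgstat.c code):

#include "postgres.h"
#include "datatype/timestamp.h"
#include "storage/barrier.h"

typedef struct
{
    uint32      changecount;    /* even = stable, odd = write in progress */
    TimestampTz last_write_ts;
} StampSlot;

/* Only ever called by the one backend that owns the slot. */
static void
stamp_slot_write(volatile StampSlot *slot, TimestampTz ts)
{
    slot->changecount++;        /* now odd: readers will retry */
    pg_write_barrier();
    slot->last_write_ts = ts;
    pg_write_barrier();
    slot->changecount++;        /* even again: contents are consistent */
}

/* Any backend may read; loops until it sees a stable snapshot. */
static TimestampTz
stamp_slot_read(volatile StampSlot *slot)
{
    uint32      before;
    TimestampTz ts;

    do
    {
        before = slot->changecount;
        pg_read_barrier();
        ts = slot->last_write_ts;
        pg_read_barrier();
    } while (before != slot->changecount || (before & 1) != 0);

    return ts;
}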

Greetings,

Andres Freund

-- 
 Andres Freund http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] 9.5: Memory-bounded HashAgg

2014-08-14 Thread Jeff Davis
On Thu, 2014-08-14 at 12:53 -0400, Tom Lane wrote:
 Oh?  So if we have aggregates like array_agg whose memory footprint
 increases over time, the patch completely fails to avoid bloat?

Yes, in its current form.

 I might think a patch with such a limitation was useful, if it weren't
 for the fact that aggregates of that nature are a large part of the
 cases where the planner misestimates the table size in the first place.
 Any complication that we add to nodeAgg should be directed towards
 dealing with cases that the planner is likely to get wrong.

In my experience, the planner has a lot of difficulty estimating the
cardinality unless it's coming from a base table without any operators
above it (other than maybe a simple predicate). This is probably a lot
more common than array_agg problems, simply because array_agg is
relatively rare compared with GROUP BY in general.

Also, there are also cases where my patch should win against Sort even
when it does go to disk. For instance, high enough cardinality to exceed
work_mem, but also a large enough group size. Sort will have to deal
with all of the tuples before it can group any of them, whereas HashAgg
can group at least some of them along the way.

Consider the skew case where the cardinality is 2M, work_mem fits 1M
groups, and the input consists of the keys 1..199 mixed randomly
inside one billion zeros. (Aside: if the input is non-random, you may
not get the skew value before the hash table fills up, in which case
HashAgg is just as bad as Sort.)

That being said, we can hold out for an array_agg fix if desired. As I
pointed out in another email, my proposal is compatible with the idea of
dumping groups out of the hash table, and does take some steps in that
direction.

Regards,
Jeff Davis




-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] jsonb format is pessimal for toast compression

2014-08-14 Thread Oleg Bartunov
I did a quick test on the same bookmarks to compare the performance of
9.4beta2 and 9.4beta2+patch

The query was the same we used in pgcon presentation:
SELECT count(*) FROM jb WHERE jb @> '{"tags":[{"term":"NYC"}]}'::jsonb;

                 table size  |  time (ms)
9.4beta2:        1374 Mb     |  1160
9.4beta2+patch:  1373 Mb     |  1213


Yes, performance degrades, but not much.  There is also a small win in table
size, but the bookmarks are not big, so it's difficult to say much about
compression.

Oleg



On Thu, Aug 14, 2014 at 9:57 PM, Tom Lane t...@sss.pgh.pa.us wrote:

 Bruce Momjian br...@momjian.us writes:
  On Thu, Aug 14, 2014 at 12:22:46PM -0400, Tom Lane wrote:
  This gets back to the problem of what test case are we going to consider
  while debating what solution to adopt.

  Uh, we just need one 12k JSON document from somewhere.  Clearly this
  is something we can easily get.

 I would put little faith in a single document as being representative.

 To try to get some statistics about a real-world case, I looked at the
 delicio.us dataset that someone posted awhile back (1252973 JSON docs).
 These have a minimum length (in text representation) of 604 bytes and
 a maximum length of 5949 bytes, which means that they aren't going to
 tell us all that much about large JSON docs, but this is better than
 no data at all.

 Since documents of only a couple hundred bytes aren't going to be subject
 to compression, I made a table of four columns each containing the same
 JSON data, so that each row would be long enough to force the toast logic
 to try to do something.  (Note that none of these documents are anywhere
 near big enough to hit the refuses-to-compress problem.)  Given that,
 I get the following statistics for pg_column_size():

                                 min     max     avg

 JSON (text) representation      382     1155    526.5

 HEAD's JSONB representation     493     1485    695.1

 all-lengths representation      440     1257    615.3

 So IOW, on this dataset the existing JSONB representation creates about
 32% bloat compared to just storing the (compressed) user-visible text,
 and switching to all-lengths would about halve that penalty.

 Maybe this is telling us it's not worth changing the representation,
 and we should just go do something about the first_success_by threshold
 and be done.  I'm hesitant to draw such conclusions on the basis of a
 single use-case though, especially one that doesn't really have that
 much use for compression in the first place.  Do we have other JSON
 corpuses to look at?

 regards, tom lane


 --
 Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
 To make changes to your subscription:
 http://www.postgresql.org/mailpref/pgsql-hackers



Re: [HACKERS] Function to know last log write timestamp

2014-08-14 Thread Robert Haas
On Thu, Aug 14, 2014 at 2:21 PM, Andres Freund and...@2ndquadrant.com wrote:
 On 2014-08-14 14:19:13 -0400, Robert Haas wrote:
 On Thu, Aug 14, 2014 at 1:51 PM, Fujii Masao masao.fu...@gmail.com wrote:
  On Fri, Aug 15, 2014 at 1:55 AM, Robert Haas robertmh...@gmail.com wrote:
  On Mon, Aug 11, 2014 at 3:20 AM, Fujii Masao masao.fu...@gmail.com 
  wrote:
  There is no extra spinlock.
 
  The version I reviewed had one; that's what I was objecting to.
 
  Sorry for confusing you. I posted the latest patch to other thread.
  This version doesn't use any spinlock.
 
  http://www.postgresql.org/message-id/CAHGQGwEwuh5CC3ib6wd0fxs9LAWme=ko09s4moxnynafn7n...@mail.gmail.com
 
  Might need to add some pg_read_barrier() and pg_write_barrier() calls,
  since we have those now.
 
  Yep, memory barries might be needed as follows.
 
  * Set the commit timestamp to shared memory.
 
  shmem->counter++;
  pg_write_barrier();
  shmem->timestamp = my_timestamp;
  pg_write_barrier();
  shmem->counter++;

  * Read the commit timestamp from shared memory

  my_count = shmem->counter;
  pg_read_barrier();
  my_timestamp = shmem->timestamp;
  pg_read_barrier();
  my_count = shmem->counter;
 
  Is this way to use memory barriers right?

 That's about the idea. However, what you've got there is actually
 unsafe, because shmem->counter++ is not an atomic operation.  It reads
 the counter (possibly even as two separate 4-byte loads if the counter
 is an 8-byte value), increments it inside the CPU, and then writes the
 resulting value back to memory.  If two backends do this concurrently,
 one of the updates might be lost.

 All these are only written by one backend, so it should be safe. Note
 that that coding pattern, just without memory barriers, is all over
 pgstat.c

Ah, OK.  If there's a separate slot for each backend, I agree that it's safe.

We should probably add barriers to pgstat.c, too.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] B-Tree support function number 3 (strxfrm() optimization)

2014-08-14 Thread Robert Haas
On Thu, Aug 14, 2014 at 1:24 PM, Peter Geoghegan p...@heroku.com wrote:
 On Thu, Aug 14, 2014 at 9:13 AM, Robert Haas robertmh...@gmail.com wrote:
 Committed that way.  As the patch is by and large the same as what I
 submitted for this originally, I credited myself as first author and
 you as second author.  I hope that seems fair.

 I think that's more than fair. Thanks!

Great.  BTW, I notice to my chagrin that 'reindex table
some_table_with_an_indexed_text_column' doesn't benefit from this,
apparently because tuplesort_begin_index_btree is used, and it knows
nothing about sortsupport.  I have a feeling there's a good reason for
that, but I don't remember what it is; do you?

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Function to know last log write timestamp

2014-08-14 Thread Andres Freund
On 2014-08-14 14:37:22 -0400, Robert Haas wrote:
 On Thu, Aug 14, 2014 at 2:21 PM, Andres Freund and...@2ndquadrant.com wrote:
  On 2014-08-14 14:19:13 -0400, Robert Haas wrote:
  That's about the idea. However, what you've got there is actually
  unsafe, because shmem->counter++ is not an atomic operation.  It reads
  the counter (possibly even as two separate 4-byte loads if the counter
  is an 8-byte value), increments it inside the CPU, and then writes the
  resulting value back to memory.  If two backends do this concurrently,
  one of the updates might be lost.
 
  All these are only written by one backend, so it should be safe. Note
  that that coding pattern, just without memory barriers, is all over
  pgstat.c
 
 Ah, OK.  If there's a separate slot for each backend, I agree that it's safe.
 
 We should probably add barriers to pgstat.c, too.

Yea, definitely. I think this is rather borked on weaker
architectures. It's just that the consequences of an out of date/torn
value are rather low, so it's unlikely to be noticed.

Imo we should encapsulate the changecount modifications/checks somehow
instead of repeating the barriers, Asserts, comments et al everywhere.
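
For illustration, a minimal sketch of what such an encapsulation could look
like (the macro and field names here are invented for the example, not taken
from pgstat.c; it assumes pg_read_barrier()/pg_write_barrier() from
storage/barrier.h and a slot that only its owning backend ever writes):

/*
 * Writer side: bump the changecount to an odd value, store the payload
 * fields, then bump it back to even.  The barriers keep the payload stores
 * from being reordered across the counter updates.
 */
#define CHANGECOUNT_BEGIN_WRITE(slot) \
	do { \
		(slot)->changecount++; \
		pg_write_barrier(); \
	} while (0)

#define CHANGECOUNT_END_WRITE(slot) \
	do { \
		pg_write_barrier(); \
		(slot)->changecount++; \
		Assert(((slot)->changecount & 1) == 0); \
	} while (0)

/*
 * Reader side: capture the counter before copying the payload, then check
 * it again afterwards.  A mismatch or an odd value means the copy may be
 * torn and the caller should loop and retry:
 *
 *     do {
 *         CHANGECOUNT_BEGIN_READ(slot, before);
 *         ts = slot->timestamp;
 *         CHANGECOUNT_END_READ(slot, before, ok);
 *     } while (!ok);
 */
#define CHANGECOUNT_BEGIN_READ(slot, before) \
	do { \
		(before) = (slot)->changecount; \
		pg_read_barrier(); \
	} while (0)

#define CHANGECOUNT_END_READ(slot, before, ok) \
	do { \
		pg_read_barrier(); \
		(ok) = ((before) == (slot)->changecount && ((before) & 1) == 0); \
	} while (0)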

Greetings,

Andres Freund

-- 
 Andres Freund http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] B-Tree support function number 3 (strxfrm() optimization)

2014-08-14 Thread Peter Geoghegan
On Thu, Aug 14, 2014 at 11:38 AM, Robert Haas robertmh...@gmail.com wrote:
 Great.  BTW, I notice to my chagrin that 'reindex table
 some_table_with_an_indexed_text_column' doesn't benefit from this,
 apparently because tuplesort_begin_index_btree is used, and it knows
 nothing about sortsupport.  I have a feeling there's a good reason for
 that, but I don't remember what it is; do you?

No, I don't, but I'm pretty sure that's because there is no good
reason. I guess the original sort support functions were most
compelling for the onlyKey case. We can't do that with
B-Tree (at least not without another qsort() specialization, like
qsort_tuple_btree()), because there is additional tie-breaker logic to
sort on item pointer within comparetup_index_btree(). I remember
arguing that that wasn't necessary, because of course I wanted to make
sortsupport as applicable as possible, but I realize in hindsight that
I was probably wrong about that.

Clearly there are still benefits to be had for cluster and B-Tree
tuplesorts. It looks like more or less a simple matter of programming
to me. _bt_mkscankey_nodata() tuplesort call sites like
tuplesort_begin_index_btree() can be taught to produce an equivalent
sortsupport state. I expect that we'll get around to fixing the
problem at some point before too long.

-- 
Peter Geoghegan


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] minor typo in pgbench doc (2)

2014-08-14 Thread Robert Haas
On Tue, Aug 12, 2014 at 4:02 PM, Fabien COELHO coe...@cri.ensmp.fr wrote:
 Yet another very minor typo in pgbench doc.

 I'm not sure of the best way to show formula in the doc.

When I built this, it left a space between the formula and the period.
Fixed that and committed.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] jsonb format is pessimal for toast compression

2014-08-14 Thread Claudio Freire
On Thu, Aug 14, 2014 at 3:49 PM, Larry White ljw1...@gmail.com wrote:
 I attached a json file of approximately 513K. It contains two repetitions of
 a single json structure. The values are quasi-random. It might make a decent
 test case of meaningfully sized data.


I have a collection of real-world JSON data that is 59MB as a plain
SQL dump (10MB compressed, 51MB on-disk table size).

This data is mostly counters and ancillary info stored in json for
flexibility, more than anything else, since it's otherwise quite
structured: most values share a lot with each other (in key names)
but there's not much redundancy within single rows.

Value length stats (in text format):

min: 14
avg: 427
max: 23239

If anyone's interested, contact me personally (I gotta anonymize the
info a bit first, since it's production info, and it's too big to
attach on the ML).


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] TODO : Allow parallel cores to be used by vacuumdb [ WIP ]

2014-08-14 Thread Robert Haas
On Mon, Aug 11, 2014 at 12:59 AM, Amit Kapila amit.kapil...@gmail.com wrote:
 1.
 +Number of parallel connections to perform the operation. This
 option will enable the vacuum
 +operation to run on parallel connections, at a time one table will
 be operated on one connection.

 a. How about describing w.r.t asynchronous connections
 instead of parallel connections?

I don't think "asynchronous" is a good choice of word.  Maybe "simultaneous"?

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Immediate standby promotion

2014-08-14 Thread Robert Haas
On Thu, Aug 14, 2014 at 10:12 AM, Tom Lane t...@sss.pgh.pa.us wrote:
 Fujii Masao masao.fu...@gmail.com writes:
 I'd like to propose to add new option --immediate to pg_ctl promote.
 When this option is set, recovery ignores any WAL data which have not
 been replayed yet and exits immediately. Patch attached.

 This promotion is faster than normal one but can cause data loss.

 TBH, I cannot imagine a situation where that would be a sane thing to do.
 If you have WAL, why would you not replay what you have?  The purpose
 of a database is to preserve your data, not randomly throw it away.

I've wanted this a number of times, so I think it's quite sane.

 Also imagine the case
 where, while recovery is being delayed (by a time-delayed standby
 which was introduced in 9.4) or paused (by pg_xlog_replay_pause),
 you find that subsequent WAL data can cause a disaster to happen
 (for example, WAL data indicating an unexpected deletion of
 important database). In this case, this immediate promotion can be
 used to ignore such problematic WAL data.

 That example does not demonstrate that a patch like this is useful.
 What you'd presumably want is a way to stop replay at a defined place
 (comparable to the PITR facilities).  Not to just abandon the WAL stream
 at whatever point replay has reached.

We already have the facilities to stop replay at a defined place.  But
then what?  Without this patch, do we tell the customer to stop
replay, do a pg_dump of the whole database, and restore it into a new
database?  Because that's crazy.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Enable WAL archiving even in standby

2014-08-14 Thread Robert Haas
On Wed, Aug 13, 2014 at 6:42 AM, Fujii Masao masao.fu...@gmail.com wrote:
 I'd propose the attached WIP patch which allows us to enable WAL archiving
 even in standby. The patch adds always as the valid value of archive_mode.
 If it's set to always, the archiver is started when the server is in standby
 mode and all the WAL files that walreceiver wrote to the disk are archived by
 using archive_command. Then, even after the server is promoted to master,
 the archiver keeps archiving WAL files. The patch doesn't change the meanings
 of the setting values on and off of archive_mode.

I like the feature, but I don't much like this as a control mechanism.
Having archive_command and standby_archive_command, as you propose
further down, seems saner.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] ALTER TABLESPACE MOVE command tag tweak

2014-08-14 Thread Robert Haas
On Wed, Aug 13, 2014 at 9:33 AM, Alvaro Herrera
alvhe...@2ndquadrant.com wrote:
 Stephen Frost wrote:

 * Alvaro Herrera (alvhe...@2ndquadrant.com) wrote:
  Stephen Frost wrote:

   Alright, sounds like this is more-or-less the consensus.  I'll see about
   making it happen shortly.
 
  Were you able to work on this?

 Apologies, I've been gone more-or-less all of July.  I'm back now and
 have time to spend on this.

 Ping?

Seriously!

We really should not be making changes of this type less than a month
from our ostensible release date.  That is not enough time for us to
notice if the changes turn out to be not as good as we think they are.
The whole point of beta is to fix things while there's still enough
time for further course correction if needed; if we say, "oh, beta's
not totally over yet, I don't have to get my changes in", then it
completely defeats the purpose of having a beta in the first place.

/rant

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Immediate standby promotion

2014-08-14 Thread Fabrízio de Royes Mello
On Thu, Aug 14, 2014 at 4:27 PM, Robert Haas robertmh...@gmail.com wrote:

 We already have the facilities to stop replay at a defined place.  But
 then what?  Without this patch, do we tell the customer to stop
 replay, do a pg_dump of the whole database, and restore it into a new
 database?  Because that's crazy.


Yeah... and as Fujii already said, another case is when an operational error
occurs on the master (like a mistaken drop database) and we have a
time-delayed standby that can be used to recover from the mistake quickly.


--
Fabrízio de Royes Mello
Consultoria/Coaching PostgreSQL
 Timbira: http://www.timbira.com.br
 Blog sobre TI: http://fabriziomello.blogspot.com
 Perfil Linkedin: http://br.linkedin.com/in/fabriziomello
 Twitter: http://twitter.com/fabriziomello


Re: [HACKERS] 9.5: Memory-bounded HashAgg

2014-08-14 Thread Tomas Vondra
On 14.8.2014 18:54, Jeff Davis wrote:
 On Thu, 2014-08-14 at 16:17 +0200, Tomas Vondra wrote:
 Either it belongs to the current batch (and either it's in the hash
 table, or you add it there), or it's not - in that case write it to a
 temp file.
 
 I think the part you left out is that you need two files per batch: one
 for the dumped-out partially-computed state values, and one for the
 tuples.
 
 In other words, you haven't really discussed the step where you reunite
 the tuples with that partially-computed state.

No, that's not how the serialize/deserialize should work. The aggregate
needs to store the state as-is, so that after deserializing it gets
pretty much the same thing.

For example, for 'median' the state is the list of all the values
received so far, and when serializing it you have to write all the
values out. After deserializing it, you will get the same list of values.

Some aggregates may use complex data structures that need a more
elaborate serialization step.
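
To make that concrete, here is a minimal, self-contained sketch of dumping
and reloading such a state for a median-style aggregate (an array of all
values seen so far). This is illustrative only -- plain stdio instead of the
backend's BufFile and memory-context machinery, and not the patch's actual
serialize/deserialize API:

#include <stdio.h>
#include <stdlib.h>

/* Illustrative state for a median-like aggregate: every value seen so far. */
typedef struct MedianState
{
	size_t		nvalues;
	double	   *values;
} MedianState;

/* Write the state out verbatim: the count first, then the raw values. */
static int
median_state_serialize(const MedianState *state, FILE *batchfile)
{
	if (fwrite(&state->nvalues, sizeof(state->nvalues), 1, batchfile) != 1)
		return -1;
	if (state->nvalues > 0 &&
		fwrite(state->values, sizeof(double), state->nvalues,
			   batchfile) != state->nvalues)
		return -1;
	return 0;
}

/* Read it back; the caller ends up with an equivalent state object. */
static int
median_state_deserialize(MedianState *state, FILE *batchfile)
{
	if (fread(&state->nvalues, sizeof(state->nvalues), 1, batchfile) != 1)
		return -1;
	state->values = NULL;
	if (state->nvalues > 0)
	{
		state->values = malloc(state->nvalues * sizeof(double));
		if (state->values == NULL ||
			fread(state->values, sizeof(double), state->nvalues,
				  batchfile) != state->nvalues)
			return -1;
	}
	return 0;
}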

 For sure, it's not for free - it may write to quite a few files. Is it
 more expensive than what you propose? I'm not sure about that. With
 your batching scheme, you'll end up with lower number of large batches,
 and you'll need to read and split them, possibly repeatedly. The
 batching scheme from hashjoin minimizes this.
 
 My approach only has fewer batches if it elects to have fewer batches,
 which might happen for two reasons:
  1. A cardinality misestimate. This certainly could happen, but we do
 have useful numbers to work from (we know the number of tuples and
 distincts that we've read so far), so it's far from a blind guess. 
  2. We're concerned about the random I/O from way too many partitions.

OK. We can't really do much with the cardinality estimate.

As for the random IO concerns, I did a quick test to see how this
behaves. I used an HP ProLiant DL380 G5 (i.e. a quite old machine, from
2006-09 if I'm not mistaken): 16GB RAM, RAID10 on 6 x 10k SAS drives,
512MB write cache. So a fairly lousy machine by today's standards.

I used a simple C program (attached) that creates N files, and writes
into them in a round-robin fashion until a particular file size is
reached. I opted for 64GB total size, 1kB writes.

./iotest filecount filesize writesize

File size is in MB, writesize is in bytes. So for example this writes 64
files, each 1GB, using 512B writes.

./iotest 64 1024 512
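
The attached program isn't reproduced inline, but the idea is simple enough
to sketch. The following is an illustrative stand-in, not the attached
iotest.c itself: same arguments, same round-robin write pattern, with the
timing and most error handling left out:

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	if (argc != 4)
	{
		fprintf(stderr, "usage: ./iotest filecount filesize writesize\n");
		return 1;
	}

	int			filecount = atoi(argv[1]);
	long		filesize_mb = atol(argv[2]);	/* per-file size in MB */
	int			writesize = atoi(argv[3]);		/* single write in bytes */
	long long	writes_per_file = (long long) filesize_mb * 1024 * 1024 / writesize;
	char	   *buffer = calloc(1, writesize);
	int		   *fds = malloc(filecount * sizeof(int));

	for (int i = 0; i < filecount; i++)
	{
		char		name[64];

		snprintf(name, sizeof(name), "iotest.%d", i);
		fds[i] = open(name, O_WRONLY | O_CREAT | O_TRUNC, 0644);
	}

	/* round-robin: one write to each file, repeat until the target size */
	for (long long w = 0; w < writes_per_file; w++)
		for (int i = 0; i < filecount; i++)
			if (write(fds[i], buffer, writesize) != writesize)
				return 1;

	/* the "before fsync" time is taken here; then flush everything */
	for (int i = 0; i < filecount; i++)
	{
		fsync(fds[i]);
		close(fds[i]);
	}
	free(buffer);
	free(fds);
	return 0;
}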

Measured is duration before/after fsync (in seconds):

 files | file size (MB) | before fsync | after fsync
-------+----------------+--------------+-------------
    32 |           2048 |       290.16 |      294.33
    64 |           1024 |       264.68 |      267.60
   128 |            512 |       278.68 |      283.44
   256 |            256 |       332.11 |      338.45
  1024 |             64 |       419.91 |      425.48
  2048 |             32 |       450.37 |      455.20

So while there is a difference, I don't think it's the 'random I/O wall'
as usually observed on rotational drives. Also, this is a 2.6.32 kernel,
and my suspicion is that with a newer one the behaviour would be better.

I also have an SSD in that machine (Intel S3700), so I did the same test
with these results:

 files | file size (MB) | before fsync | after fsync
-------+----------------+--------------+-------------
    32 |           2048 |       445.05 |      464.73
    64 |           1024 |       447.32 |      466.56
   128 |            512 |       446.63 |      465.90
   256 |            256 |       446.64 |      466.19
  1024 |             64 |       511.85 |      523.24
  2048 |             32 |       579.92 |      590.76

So yes, the number of files matters, but I don't think the effect is strong
enough to draw a clear line on how many batches we allow. Especially
considering how old this machine is (on 3.x kernels, we usually see much
better performance in I/O-intensive conditions).


 I fail to see how this is different from your approach? How can you
 output any tuples before processing the whole inner relation?
 
 Right, the only thing I avoid is scanning the hash table and dumping out
 the groups.
 
 This isn't a major distinction, more like my approach does a little
 less work before returning tuples, and I'm not even sure I can defend
 that, so I'll retract this point.
 
 Your approach is to do multi-level batching, and I was thinking whether
 it'd be possible to use the same approach (single level). But in
 retrospect it probably does not make much sense, because the multi-level
 batching is one of the points of the proposed approach.
 
 Now that I think about it, many of the points we discussed could
 actually work with either approach:
   * In my approach, if I need more partitions, I could create more in
 much the same way as HashJoin to keep it single-level (as you suggest
 above).
   * In your approach, if there are too many partitions, you could avoid
 

Re: [HACKERS] B-Tree support function number 3 (strxfrm() optimization)

2014-08-14 Thread Peter Geoghegan
On Thu, Aug 14, 2014 at 11:55 AM, Peter Geoghegan p...@heroku.com wrote:
 Clearly there are still benefits to be had for cluster and B-Tree
 tuplesorts.

In a world where this general support exists, abbreviated keys could
be made to work with both of those, but not datum tuplesorts, because
that case needs to support tuplesort_getdatum(). Various datum
tuplesort clients expect to be able to retrieve the original
representation stored in SortTuple.datum1, and there isn't much we can
do about that.

This is a bit messy, because now you have heap and datum cases able to
use the onlyKey qsort specialization (iff the opclass doesn't provide
abbreviated key support in the heap case), while all cases except the
datum case support abbreviated keys. It's not that bad though; at
least the onlyKey qsort specialization doesn't have to care about
abbreviated keys, which makes sense because it's generally only
compelling for pass-by-value types.
-- 
Peter Geoghegan


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] replication commands and index terms

2014-08-14 Thread Robert Haas
On Thu, Aug 14, 2014 at 12:59 PM, Fujii Masao masao.fu...@gmail.com wrote:
 Since I sometimes try to search the replication command in the index,
 I'd feel inclined to expose all those commands as index terms...

+1.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


[HACKERS] 9.4 logical decoding assertion

2014-08-14 Thread Steve Singer

I hit the following on 9.4 testing logical decoding.


TRAP: FailedAssertion("!(prev_first_lsn < cur_txn->first_lsn)", File:
"reorderbuffer.c", Line: 618)

LOG:  server process (PID 3801) was terminated by signal 6: Aborted

Unfortunately I don't have a core file and I haven't been able to 
reproduce this.










--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Immediate standby promotion

2014-08-14 Thread Kevin Grittner
Fabrízio de Royes Mello fabriziome...@gmail.com wrote:
 Robert Haas robertmh...@gmail.com wrote:

 We already have the facilities to stop replay at a defined
 place.  But then what?  Without this patch, do we tell the
 customer to stop replay, do a pg_dump of the whole database, and
 restore it into a new database?  Because that's crazy.

 Yeah... and as Fujji already said another case is when some
 operation error occurs in the master (like a wrong drop
 database) and we have a time-delayed standby that can be used to
 recover the mistake quickly.

I have been in the position of having an ad hoc data fix by someone
running raw SQL where they forgot the WHERE clause on a DELETE from
the table that just about everything joins to (the Case table
for a court system).  Since we had both PITR backups and logical
replication we were able to recover by kicking the users out, doing
a PITR recovery up to shortly before the mistake was made, and then
replaying the logical transaction stream from that point to the
end, excluding the bad transaction.

On the face of it, that doesn't sound like a big deal, right?  But
we had to kick out seven state Supreme Court justices, 16 Court of
Appeals judges, and the related support staff for a couple hours.
Trust me, with a delayed replica and the option of an immediate
promotion of the standby, I would have had a less stressful day.
Instead of telling all those people they couldn't use a key tool in
their workflow for two hours, I could have told them that there
would be a one or two minute outage after which any entries in the
last n minutes would be delayed in appearing in their view of the
data for two hours.  The justices would have been a lot happier,
and when they are happier, so is everyone else.  :-)

The suggested feature seems useful to me.

--
Kevin Grittner
EDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] jsonb format is pessimal for toast compression

2014-08-14 Thread Claudio Freire
On Thu, Aug 14, 2014 at 4:24 PM, Claudio Freire klaussfre...@gmail.com wrote:
 On Thu, Aug 14, 2014 at 3:49 PM, Larry White ljw1...@gmail.com wrote:
 I attached a json file of approximately 513K. It contains two repetitions of
 a single json structure. The values are quasi-random. It might make a decent
 test case of meaningfully sized data.


 I have a collection of real-world JSON data that is 59MB as a plain
 SQL dump (10MB compressed, 51MB on-disk table size).

 This data is mostly counters and ancillary info stored in json for
 flexibility, more than anything else, since it's otherwise quite
 structured: most values share a lot with each other (in key names)
 but there's not much redundancy within single rows.

 Value length stats (in text format):

 min: 14
 avg: 427
 max: 23239

 If anyone's interested, contact me personally (I gotta anonymize the
 info a bit first, since it's production info, and it's too big to
 attach on the ML).

Oh, that one has only a 13k toast table, not very interesting.

But I've got another (very similar) 47M table with a 40M toast table; length distribution:

min: 19
avg: 474
max: 20370

Not sure why it's got a bigger toast table despite a similar distribution.
It just shows how meaningless min/avg/max stats are :(


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


[HACKERS] LIMIT for UPDATE and DELETE

2014-08-14 Thread Rukh Meski
Greetings,

Based on the feedback on my previous patch, I've separated only the
LIMIT part into its own feature.  This version plays nicely with
inheritance.  The intended use is splitting up big UPDATEs and DELETEs
into batches more easily and efficiently.


♜
*** a/doc/src/sgml/ref/delete.sgml
--- b/doc/src/sgml/ref/delete.sgml
***
*** 25,30  PostgreSQL documentation
--- 25,31 
  DELETE FROM [ ONLY ] replaceable class=PARAMETERtable_name/replaceable 
[ * ] [ [ AS ] replaceable class=parameteralias/replaceable ]
  [ USING replaceable class=PARAMETERusing_list/replaceable ]
  [ WHERE replaceable class=PARAMETERcondition/replaceable | WHERE 
CURRENT OF replaceable class=PARAMETERcursor_name/replaceable ]
+ [ LIMIT { replaceable class=parametercount/replaceable | ALL } ]
  [ RETURNING * | replaceable 
class=parameteroutput_expression/replaceable [ [ AS ] replaceable 
class=parameteroutput_name/replaceable ] [, ...] ]
  /synopsis
   /refsynopsisdiv
***
*** 56,61  DELETE FROM [ ONLY ] replaceable 
class=PARAMETERtable_name/replaceable [ *
--- 57,70 
/para
  
para
+If the literalLIMIT/ (or literalFETCH FIRST/) clause
+is present, processing will stop after the system has deleted the
+specified amount of rows.  Unlike in literalSELECT/, the
+literalOFFSET/literal clause is not available in
+literalDELETE/.
+   /para
+ 
+   para
 The optional literalRETURNING/ clause causes commandDELETE/
 to compute and return value(s) based on each row actually deleted.
 Any expression using the table's columns, and/or columns of other
*** a/doc/src/sgml/ref/update.sgml
--- b/doc/src/sgml/ref/update.sgml
***
*** 29,34  UPDATE [ ONLY ] replaceable 
class=PARAMETERtable_name/replaceable [ * ] [
--- 29,35 
  } [, ...]
  [ FROM replaceable class=PARAMETERfrom_list/replaceable ]
  [ WHERE replaceable class=PARAMETERcondition/replaceable | WHERE 
CURRENT OF replaceable class=PARAMETERcursor_name/replaceable ]
+ [ LIMIT { replaceable class=parametercount/replaceable | ALL } ]
  [ RETURNING * | replaceable 
class=parameteroutput_expression/replaceable [ [ AS ] replaceable 
class=parameteroutput_name/replaceable ] [, ...] ]
  /synopsis
   /refsynopsisdiv
***
*** 51,56  UPDATE [ ONLY ] replaceable 
class=PARAMETERtable_name/replaceable [ * ] [
--- 52,66 
 circumstances.
/para
  
+ 
+   para
+If the literalLIMIT/ (or literalFETCH FIRST/) clause
+is present, processing will stop after the system has updated
+the specified amount of rows.  Unlike in literalSELECT/, the
+literalOFFSET/literal clause is not available in
+literalUPDATE/.
+   /para
+ 
para
 The optional literalRETURNING/ clause causes commandUPDATE/
 to compute and return value(s) based on each row actually updated.
*** a/src/backend/executor/nodeModifyTable.c
--- b/src/backend/executor/nodeModifyTable.c
***
*** 323,329  ExecDelete(ItemPointer tupleid,
   TupleTableSlot *planSlot,
   EPQState *epqstate,
   EState *estate,
!  bool canSetTag)
  {
ResultRelInfo *resultRelInfo;
RelationresultRelationDesc;
--- 323,329 
   TupleTableSlot *planSlot,
   EPQState *epqstate,
   EState *estate,
!  int64_t *processed)
  {
ResultRelInfo *resultRelInfo;
RelationresultRelationDesc;
***
*** 480,487  ldelete:;
 */
}
  
!   if (canSetTag)
!   (estate-es_processed)++;
  
/* AFTER ROW DELETE Triggers */
ExecARDeleteTriggers(estate, resultRelInfo, tupleid, oldtuple);
--- 480,486 
 */
}
  
!   (*processed)++;
  
/* AFTER ROW DELETE Triggers */
ExecARDeleteTriggers(estate, resultRelInfo, tupleid, oldtuple);
***
*** 572,578  ExecUpdate(ItemPointer tupleid,
   TupleTableSlot *planSlot,
   EPQState *epqstate,
   EState *estate,
!  bool canSetTag)
  {
HeapTuple   tuple;
ResultRelInfo *resultRelInfo;
--- 571,577 
   TupleTableSlot *planSlot,
   EPQState *epqstate,
   EState *estate,
!  int64_t *processed)
  {
HeapTuple   tuple;
ResultRelInfo *resultRelInfo;
***
*** 771,778  lreplace:;

   estate);
}
  
!   if (canSetTag)
!   (estate-es_processed)++;
  
/* AFTER ROW UPDATE Triggers */
ExecARUpdateTriggers(estate, resultRelInfo, tupleid, oldtuple, tuple,
--- 770,776 

 

Re: [HACKERS] jsonb format is pessimal for toast compression

2014-08-14 Thread Tom Lane
Peter Geoghegan p...@heroku.com writes:
 On Thu, Aug 14, 2014 at 10:57 AM, Tom Lane t...@sss.pgh.pa.us wrote:
 Maybe this is telling us it's not worth changing the representation,
 and we should just go do something about the first_success_by threshold
 and be done.  I'm hesitant to draw such conclusions on the basis of a
 single use-case though, especially one that doesn't really have that
 much use for compression in the first place.  Do we have other JSON
 corpuses to look at?

 Yes. Pavel posted some representative JSON data a while back:
 http://pgsql.cz/data/data.dump.gz (it's a plain dump)

I did some quick stats on that.  206560 rows

                                         min      max     avg
external text representation             220   172685   880.3
JSON representation (compressed text)    224    78565   541.3
pg_column_size, JSONB HEAD repr.         225    82540   639.0
pg_column_size, all-lengths repr.        225    66794   531.1

So in this data, there definitely is some scope for compression:
just compressing the text gets about 38% savings.  The all-lengths
hack is able to beat that slightly, but the all-offsets format is
well behind at 27%.

Not sure what to conclude.  It looks from both these examples like
we're talking about a 10 to 20 percent size penalty for JSON objects
that are big enough to need compression.  Is that beyond our threshold
of pain?  I'm not sure, but there is definitely room to argue that the
extra I/O costs will swamp any savings we get from faster access to
individual fields or array elements.

regards, tom lane


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] jsonb format is pessimal for toast compression

2014-08-14 Thread Gavin Flower

On 15/08/14 09:47, Tom Lane wrote:

Peter Geoghegan p...@heroku.com writes:

On Thu, Aug 14, 2014 at 10:57 AM, Tom Lane t...@sss.pgh.pa.us wrote:

Maybe this is telling us it's not worth changing the representation,
and we should just go do something about the first_success_by threshold
and be done.  I'm hesitant to draw such conclusions on the basis of a
single use-case though, especially one that doesn't really have that
much use for compression in the first place.  Do we have other JSON
corpuses to look at?

Yes. Pavel posted some representative JSON data a while back:
http://pgsql.cz/data/data.dump.gz (it's a plain dump)

I did some quick stats on that.  206560 rows

                                         min      max     avg
external text representation             220   172685   880.3
JSON representation (compressed text)    224    78565   541.3
pg_column_size, JSONB HEAD repr.         225    82540   639.0
pg_column_size, all-lengths repr.        225    66794   531.1

So in this data, there definitely is some scope for compression:
just compressing the text gets about 38% savings.  The all-lengths
hack is able to beat that slightly, but the all-offsets format is
well behind at 27%.

Not sure what to conclude.  It looks from both these examples like
we're talking about a 10 to 20 percent size penalty for JSON objects
that are big enough to need compression.  Is that beyond our threshold
of pain?  I'm not sure, but there is definitely room to argue that the
extra I/O costs will swamp any savings we get from faster access to
individual fields or array elements.

regards, tom lane


Curious, would adding the standard deviation help in characterising the 
distribution of data values?


Also, you might like to consider additionally using the median value, and
possibly the 25% and 75% (or some such) percentiles.  I assume the 'avg' in
your table refers to the arithmetic mean.  Sometimes the median is a
better measure of 'normal' than the arithmetic mean, and it can be useful
to note the difference between the two!


Graphing the values may also be useful.  You might have two or more
distinct populations, which would show up as several distinct peaks - in
which case, this might suggest changes to the algorithm.


Many moons ago, I did a 400-level statistics course at university, most of
which I've forgotten.  However, I'm aware of other potentially
useful measures, but I suspect that they would be too esoteric for the
current problem!



Cheers,
Gavin



--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] ALTER TABLESPACE MOVE command tag tweak

2014-08-14 Thread Tom Lane
Robert Haas robertmh...@gmail.com writes:
 We really should not be making changes of this type less than a month
 from our ostensible release date.  That is not enough time for us to
 notice if the changes turn out to be not as good as we think they are.

If it weren't for the fact that we'll be wedded forevermore to a bad
choice of syntax, I might agree with you.  But at this point, the
alternatives we have are to fix it now, or fix it never.  I don't
like #2.

regards, tom lane


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] ALTER TABLESPACE MOVE command tag tweak

2014-08-14 Thread Stephen Frost
* Tom Lane (t...@sss.pgh.pa.us) wrote:
 Robert Haas robertmh...@gmail.com writes:
  We really should not be making changes of this type less than a month
  from our ostensible release date.  That is not enough time for us to
  notice if the changes turn out to be not as good as we think they are.
 
 If it weren't for the fact that we'll be wedded forevermore to a bad
 choice of syntax, I might agree with you.  But at this point, the
 alternatives we have are to fix it now, or fix it never.  I don't
 like #2.

I'm planning to fix it shortly, as I mentioned to Alvaro on IRC when I
saw his note.

Thanks,

Stephen


signature.asc
Description: Digital signature


Re: [HACKERS] delta relations in AFTER triggers

2014-08-14 Thread Kevin Grittner
Amit Khandekar amit.khande...@enterprisedb.com wrote:

 The execution level itself was almost trivial; it's getting the
 tuplestore reference through the parse analysis and planning
 phases that is painful for me.
 I am not sure why you think we would need to refer the
 tuplestore in the parse analysis and planner phases. It seems
 that we would need them only in execution phase. Or may be I
 didn't understand your point.
 Ah I think I understand now. That might be because you are
 thinking of having an infrastructure common to triggers and
 materialized views, right ?

Well, it's more immediate than that.  The identifiers in the
trigger function are not resolved to particular objects until there
is a request to fire the trigger.  At that time the parse analysis
needs to find the name defined somewhere.  It's not defined in the
catalogs like a table or view, and it's not defined in the query
itself like a CTE or VALUES clause.  The names specified in trigger
creation must be recognized as needing to resolve to the new
TuplestoreScan, and it needs to match those to the tuplestores
themselves.  Row counts, costing, etc. needs to be provided so the
optimizer can pick a good plan in what might be a complex query
with many options.  I'm finding the planner work here to be harder
than everything else put together.

On the bright side, once I'm done, I might know enough about the
planner to do things a lot faster next time.  :-)

--
Kevin Grittner
EDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] jsonb format is pessimal for toast compression

2014-08-14 Thread Josh Berkus
So, here's a destruction test case:

200,000 JSON values (plus 2 key columns)
Average width 4K (+/- 1K)
183 keys per JSON value
keys 10 to 30 characters
105 float values
70 integer values
8 text and date values
no nesting

The jsonic table is JSON
The jsonbish table is JSONB

(I can't share this data set, but it makes a good test case)

And, we see the effect:

postgres=# select pg_size_pretty(pg_total_relation_size('jsonic'));
 pg_size_pretty

 394 MB
(1 row)

postgres=# select pg_size_pretty(pg_total_relation_size('jsonbish'));
 pg_size_pretty

 1147 MB
(1 row)

So, pretty bad; JSONB is 200% larger than JSON.

I don't think having 183 top-level keys is all that unreasonable of a
use case.  Some folks will be migrating from Mongo, Redis or Couch to
PostgreSQL, and might have a whole denormalized schema in JSON.

BTW, I find this peculiar:

postgres=# select pg_size_pretty(pg_relation_size('jsonic'));

 pg_size_pretty

 383 MB
(1 row)

postgres=# select pg_size_pretty(pg_relation_size('jsonbish'));

 pg_size_pretty

 11 MB
(1 row)

Next up: Tom's patch and indexing!

-- 
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] jsonb format is pessimal for toast compression

2014-08-14 Thread Tom Lane
Josh Berkus j...@agliodbs.com writes:
 So, here's a destruction test case:
 200,000 JSON values (plus 2 key columns)
 Average width 4K (+/- 1K)
 183 keys per JSON value

Is that 183 keys exactly each time, or is 183 the average?
If so, what's the min/max number of keys?

I ask because 183 would be below the threshold where I'd expect the
no-compression behavior to kick in.

 And, we see the effect:

 postgres=# select pg_size_pretty(pg_total_relation_size('jsonic'));
  pg_size_pretty
 
  394 MB
 (1 row)

 postgres=# select pg_size_pretty(pg_total_relation_size('jsonbish'));
  pg_size_pretty
 
  1147 MB
 (1 row)

 So, pretty bad; JSONB is 200% larger than JSON.

Ouch.  But it's not clear how much of this is from the first_success_by
threshold and how much is from having poor compression even though we
escaped that trap.

 BTW, I find this peculiar:

 postgres=# select pg_size_pretty(pg_relation_size('jsonic'));

  pg_size_pretty
 
  383 MB
 (1 row)

 postgres=# select pg_size_pretty(pg_relation_size('jsonbish'));

  pg_size_pretty
 
  11 MB
 (1 row)

pg_relation_size is just the main data fork; it excludes TOAST.
So what we can conclude is that most of the data got toasted out-of-line
in jsonb, while very little did in json.  That probably just comes from
the average datum size being close to the push-out-of-line threshold,
so that worse compression puts it over the edge.

It would be useful to see min/max/avg of pg_column_size() in both
these cases.

regards, tom lane


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] jsonb format is pessimal for toast compression

2014-08-14 Thread Josh Berkus
On 08/14/2014 04:02 PM, Tom Lane wrote:
 Josh Berkus j...@agliodbs.com writes:
 So, here's a destruction test case:
 200,000 JSON values (plus 2 key columns)
 Average width 4K (+/- 1K)
 183 keys per JSON value
 
 Is that 183 keys exactly each time, or is 183 the average?

Each time exactly.

 It would be useful to see min/max/avg of pg_column_size() in both
 these cases.

Well, this is 9.4, so I can do better than that.  How about quartiles?

 thetype |colsize_distribution
-+
 json| {1777,1803,1890,1940,4424}
 jsonb   | {5902,5926,5978,6002,6208}

-- 
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] jsonb format is pessimal for toast compression

2014-08-14 Thread Tom Lane
Josh Berkus j...@agliodbs.com writes:
 On 08/14/2014 04:02 PM, Tom Lane wrote:
 It would be useful to see min/max/avg of pg_column_size() in both
 these cases.

 Well, this is 9.4, so I can do better than that.  How about quartiles?

  thetype |colsize_distribution
 -+
  json| {1777,1803,1890,1940,4424}
  jsonb   | {5902,5926,5978,6002,6208}

OK.  That matches with the observation about being mostly toasted or not
--- the threshold for pushing out-of-line would be something a little
under 2KB depending on the other columns you had in the table.

What's more, it looks like the jsonb data is pretty much never getting
compressed --- the min is too high for that.  So I'm guessing that this
example is mostly about the first_success_by threshold preventing any
compression from happening.  Please, before looking at my other patch,
try this: in src/backend/utils/adt/pg_lzcompress.c, change line 221
thusly:

-   1024,       /* Give up if no compression in the first 1KB */
+   INT_MAX,    /* Give up if no compression in the first 1KB */

then reload the jsonb data and give us the same stats on that.

regards, tom lane


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] 9.5: Memory-bounded HashAgg

2014-08-14 Thread Tomas Vondra
On 14.8.2014 21:47, Tomas Vondra wrote:
 On 14.8.2014 18:54, Jeff Davis wrote:
 On Thu, 2014-08-14 at 16:17 +0200, Tomas Vondra wrote:
 Either it belongs to the current batch (and either it's in the hash
 table, or you add it there), or it's not - in that case write it to a
 temp file.

 I think the part you left out is that you need two files per batch: one
 for the dumped-out partially-computed state values, and one for the
 tuples.

 In other words, you haven't really discussed the step where you reunite
 the tuples with that partially-computed state.
 
 No, that's not how the serialize/deserialize should work. The aggregate
 needs to store the state as-is, so that after deserializing it gets
 pretty much the same thing.
 
 For example, for 'median' the state is the list of all the values
 received so far, and when serializing it you have to write all the
 values out. After deserializing it, you will get the same list of values.
 
 Some aggregates may use complex data structures that may need more
 elaborate serialize.
 
 For sure, it's not for free - it may write to quite a few files. Is it
 more expensive than what you propose? I'm not sure about that. With
 your batching scheme, you'll end up with lower number of large batches,
 and you'll need to read and split them, possibly repeatedly. The
 batching scheme from hashjoin minimizes this.

 My approach only has fewer batches if it elects to have fewer batches,
 which might happen for two reasons:
  1. A cardinality misestimate. This certainly could happen, but we do
 have useful numbers to work from (we know the number of tuples and
 distincts that we've read so far), so it's far from a blind guess. 
  2. We're concerned about the random I/O from way too many partitions.
 
 OK. We can't really do much with the cardinality estimate.
 
 As for the random IO concerns, I did a quick test to see how this
 behaves. I used a HP ProLiant DL380 G5 (i.e. a quite old machine, from
 2006-09 if I'm not mistaken). 16GB RAM, RAID10 on 6 x 10k SAS drives,
 512MB write cache. So a quite lousy machine, considering today's standards.
 
 I used a simple C program (attached) that creates N files, and writes
 into them in a round-robin fashion until a particular file size is
 reached. I opted for 64GB total size, 1kB writes.
 
 ./iotest filecount filesize writesize
 
 File size is in MB, writesize is in bytes. So for example this writes 64
 files, each 1GB, using 512B writes.
 
 ./iotest 64 1024 512
 
 Measured is duration before/after fsync (in seconds):
 
 files   |file size  |  before  fsync |  after fsync
-
 32  |  2048 |290.16  |  294.33
 64  |  1024 |264.68  |  267.60
 128 |   512 |278.68  |  283.44
 256 |   256 |332.11  |  338.45
 1024|64 |419.91  |  425.48
 2048|32 |450.37  |  455.20
 
 So while there is a difference, I don't think it's the 'random I/O wall'
 as usually observed on rotational drives. Also, this is 2.6.32 kernel,
 and my suspicion is that with a newer one the behaviour would be better.
 
 I also have an SSD in that machine (Intel S3700), so I did the same test
 with these results:
 
 files   |file size  |  before  fsync |  after fsync
-
 32  |  2048 |445.05  |  464.73
 64  |  1024 |447.32  |  466.56
 128 |   512 |446.63  |  465.90
 256 |   256 |446.64  |  466.19
 1024|64 |511.85  |  523.24
 2048|32 |579.92  |  590.76
 
 So yes, the number of files matter, but I don't think it's strong enough
 to draw a clear line on how many batches we allow. Especially
 considering how old this machine is (on 3.x kernels, we usually see much
 better performance in I/O intensive conditions).

And just for fun, I did the same test on a workstation with 8GB of RAM,
S3700 SSD, i5-2500 CPU and kernel 3.12. That is, a more modern
hardware / kernel / ... compared to the machine above.

For a test writing 32GB of data (4x the RAM), I got these results:

 files | file size (MB) | before fsync | after fsync
-------+----------------+--------------+-------------
    32 |           1024 |       171.27 |      175.96
    64 |            512 |       165.57 |      170.12
   128 |            256 |       165.29 |      169.95
   256 |            128 |       164.69 |      169.62
   512 |             64 |       163.98 |      168.90
  1024 |             32 |       165.44 |      170.50
  2048 |             16 |       165.97 |      171.35
  4096 |              8 |       166.55 |      173.26

So, no sign of slowdown at all, in this case. I don't have a rotational
disk in the machine at this 

Re: [HACKERS] jsonb format is pessimal for toast compression

2014-08-14 Thread Josh Berkus

 What's more, it looks like the jsonb data is pretty much never getting
 compressed --- the min is too high for that.  So I'm guessing that this
 example is mostly about the first_success_by threshold preventing any
 compression from happening.  Please, before looking at my other patch,
 try this: in src/backend/utils/adt/pg_lzcompress.c, change line 221
 thusly:
 
 -   1024,       /* Give up if no compression in the first 1KB */
 +   INT_MAX,    /* Give up if no compression in the first 1KB */
 
 then reload the jsonb data and give us the same stats on that.

That helped things, but not as much as you'd think:

postgres=# select pg_size_pretty(pg_total_relation_size('jsonic'));

 pg_size_pretty

 394 MB
(1 row)

postgres=# select pg_size_pretty(pg_total_relation_size('jsonbish'));
 pg_size_pretty

 801 MB
(1 row)

What I find really strange is that the column size distribution is
exactly the same:

 thetype |colsize_distribution
-+
 json| {1777,1803,1890,1940,4424}
 jsonb   | {5902,5926,5978,6002,6208}

Shouldn't the lower end stuff be smaller?

-- 
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] jsonb format is pessimal for toast compression

2014-08-14 Thread Josh Berkus
On 08/14/2014 04:47 PM, Josh Berkus wrote:
  thetype |colsize_distribution
 -+
  json| {1777,1803,1890,1940,4424}
  jsonb   | {5902,5926,5978,6002,6208}

Just realized my query was counting the whole row size instead of just
the column size.  Here's just the JSON column:

Before changing to to INT_MAX:

 thetype |colsize_distribution
-+
 json| {1741,1767,1854,1904,2292}
 jsonb   | {3551,5866,5910,5958,6168}

After:

 thetype |colsize_distribution
-+
 json| {1741,1767,1854,1904,2292}
 jsonb   | {3515,3543,3636,3690,4038}

So that did improve things, just not as much as we'd like.

-- 
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Reporting the commit LSN at commit time

2014-08-14 Thread Craig Ringer
On 08/15/2014 12:21 AM, Robert Haas wrote:
 On Sat, Aug 9, 2014 at 12:54 PM, Andres Freund and...@2ndquadrant.com wrote:

 I don't have a very clear idea whether this is a good idea in any form
 because I can't quite imagine how this is going to be used by the
 client without adding an unwarranted amount of complexity there.

Full automatic transparent failover _will_ be complex on the client. No
denying that. The hard parts are picking which node to connect to when
one goes away, the decision making around what to do when the new node
fails to catch up to the last committed state on the old node, and
tracking session state.

There are some quite simple uses too though. The main one of interest to
me is an app that routes read-only queries to an async read-replica and
wants to guarantee that some of them see a state consistent with the
last commit on the master.

It's the first thing many people have asked me about BDR, though: how
does client-side failover work?  This is a priority for a lot of people.

As far as I can see, if you have client-side failover with asynchronous
replication of any form, the client _must_ have some way to reliably
connect to a new node and ask it "are you caught up to the state of the
last node I was connected to yet?", or "please wait until the last
transaction I committed elsewhere is visible here".

The client must keep track of some kind of information that indicates
the last node it talked to and identifies the last transaction it
committed. ("Client" could mean a proxy, in the case of a failover-aware
pgbouncer.)
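
For the simple read-replica case above, the check can be as small as polling
the standby until it has replayed a known commit LSN. A rough sketch,
assuming a 9.4-era server (pg_lsn and pg_last_xlog_replay_location()
available) and an application that has carried the master's commit LSN along
as text -- not how BDR or any driver actually implements it:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <libpq-fe.h>

/* Poll a standby until it has replayed at least target_lsn (e.g. "0/3000060"). */
static int
wait_for_lsn(PGconn *standby, const char *target_lsn)
{
	const char *params[1] = { target_lsn };

	for (;;)
	{
		PGresult   *res;
		int			caught_up;

		res = PQexecParams(standby,
						   "SELECT pg_last_xlog_replay_location() >= $1::pg_lsn",
						   1, NULL, params, NULL, NULL, 0);
		if (PQresultStatus(res) != PGRES_TUPLES_OK)
		{
			fprintf(stderr, "query failed: %s", PQerrorMessage(standby));
			PQclear(res);
			return -1;
		}
		/* NULL here would mean "not in recovery"; a real client must handle that */
		caught_up = (strcmp(PQgetvalue(res, 0, 0), "t") == 0);
		PQclear(res);

		if (caught_up)
			return 0;			/* safe to route this session's reads here */
		usleep(100 * 1000);		/* not caught up yet; retry shortly */
	}
}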

 So, even accepting for the moment the premise that the basic idea here
 is good, I think the benefits would have to be monumental to convince
 me that a protocol change is a good idea.  If we do anything like
 that, we'll be hearing about the downstream damage for years.

Yes, that's a real concern.

PgJDBC and psqlODBC both implement the wire protocol themselves. PgJDBC
does because it's a type 4 JDBC driver (pure Java, no native code, no
recompile required). I don't understand why psqlODBC goes its own way
instead of using libpq, but it does.

There are also numerous language-specific pure-language bindings, though
half of them seem pretty close to unmaintained.

That's why I proposed a new protocol message carrying extra info, that
clients can optionally request only if they understand it. Nobody else
needs to care or notice that anything's new.

The v2 to v3 protocol switch has only now reached the point where it's
realistic to drop v2 support from clients. I'm hardly keen to do
another protocol rev, especially for something as minor as this.


-- 
 Craig Ringer   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Reporting the commit LSN at commit time

2014-08-14 Thread Josh Berkus
On 08/14/2014 05:45 PM, Craig Ringer wrote:

 Wouldn't that force client drivers - libpq, psqlODBC, PgJDBC, etc - to
 all watch for explicit COMMITs sent by the application and rewrite them?

Realistically, users are going to need new drivers to take advantage of
any automated connection failover anyway.

 Full automatic transparent failover _will_ be complex on the client. No
 denying that. The hard parts are picking which node to connect to when
 one goes away, the decision making around what to do when the new node
 fails to catch up to the last committed state on the old node, and
 tracking session state.

Frankly, I'd love to see just the simplest version of this implemented
in libpq as a start: the ability for client drivers to take a list of
hosts instead of a single hostaddr (this was discussed at the 2013
clustering meeting).
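
Until something like that exists in libpq itself, the loop has to live in the
application or the driver. A trivial sketch of the idea (illustrative only; a
real implementation would also want to check that the chosen node is a usable
master or a sufficiently caught-up replica, per the points below):

#include <stddef.h>
#include <libpq-fe.h>

/* Try each conninfo string in order and return the first working connection. */
static PGconn *
connect_first_available(const char *conninfos[], int n)
{
	for (int i = 0; i < n; i++)
	{
		PGconn	   *conn = PQconnectdb(conninfos[i]);

		if (PQstatus(conn) == CONNECTION_OK)
			return conn;		/* first reachable host wins */

		PQfinish(conn);			/* clean up and fall through to the next host */
	}
	return NULL;				/* nothing reachable */
}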

 There are some quite simple uses too though. The main one of interest to
 me is an app that routes read-only queries to an async read-replica and
 wants to guarantee that some of them see a state consistent with the
 last commit on the master.
 
 It's the first thing many people have asked me about BDR, though: how
 does client-side failover work?  This is a priority for a lot of people.

 As far as I can see, if you have client-side failover with asynchronous
 replication of any form, the client _must_ have some way to reliably
 connect to a new node and ask it "are you caught up to the state of the
 last node I was connected to yet?", or "please wait until the last
 transaction I committed elsewhere is visible here".

There are quite a few use-cases where this information isn't required;
even for BDR, I'd love to see the ability to disable this check.

There are also cases where it's not adequate; the user may not have
committed anything on the master, but they still don't want to connect
to a replica which is hours behind the last node they queried.

There are also use-cases for which automated connection failover without a
managed proxy is a Seriously Bad Idea.  For one thing, you're setting up
a strong risk of split-brain.

-- 
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] jsonb format is pessimal for toast compression

2014-08-14 Thread Josh Berkus

 Before changing to to INT_MAX:
 
  thetype |colsize_distribution
 -+
  json| {1741,1767,1854,1904,2292}
  jsonb   | {3551,5866,5910,5958,6168}
 
 After:
 
  thetype |colsize_distribution
 -+
  json| {1741,1767,1854,1904,2292}
  jsonb   | {3515,3543,3636,3690,4038}
 
 So that did improve things, just not as much as we'd like.

And with Tom's test patch:

postgres=# select pg_size_pretty(pg_total_relation_size('jsonic'));

 pg_size_pretty

 394 MB
(1 row)

postgres=# select pg_size_pretty(pg_total_relation_size('jsonbish'));
 pg_size_pretty

 541 MB
(1 row)

 thetype |colsize_distribution
-+
 json| {1741,1767,1854,1904,2292}
 jsonb   | {2037,2114,2288,2348,2746}

Since that improved things a *lot*, just +40% instead of +200%, I
thought I'd test some select queries.  I decided to test a GIN lookup
and value extraction, since indexed lookup is really what I care about.

9.4b2 no patches:

postgres=# explain analyze select row_to_json -> 'kt1_total_sum' from
jsonbish where row_to_json @> '{ "rpt_per_dt" : "2003-06-30" }';
QUERY PLAN
---
 Bitmap Heap Scan on jsonbish  (cost=29.55..582.92 rows=200 width=18)
(actual time=20.814..2845.454 rows=100423 loops=1)
   Recheck Cond: (row_to_json @> '{"rpt_per_dt": "2003-06-30"}'::jsonb)
   Heap Blocks: exact=1471
   ->  Bitmap Index Scan on jsonbish_row_to_json_idx  (cost=0.00..29.50
rows=200 width=0) (actual time=20.551..20.551 rows=100423 loops=1)
 Index Cond: (row_to_json @> '{"rpt_per_dt": "2003-06-30"}'::jsonb)
 Planning time: 0.102 ms
 Execution time: 2856.179 ms


9.4b2 TL patch:

postgres=# explain analyze select row_to_json -> 'kt1_total_sum' from
jsonbish where row_to_json @> '{ "rpt_per_dt" : "2003-06-30" }';
QUERY PLAN
---
 Bitmap Heap Scan on jsonbish  (cost=29.55..582.92 rows=200 width=18)
(actual time=24.071..5201.687 rows=100423 loops=1)
   Recheck Cond: (row_to_json @> '{"rpt_per_dt": "2003-06-30"}'::jsonb)
   Heap Blocks: exact=1471
   ->  Bitmap Index Scan on jsonbish_row_to_json_idx  (cost=0.00..29.50
rows=200 width=0) (actual time=23.779..23.779 rows=100423 loops=1)
 Index Cond: (row_to_json @> '{"rpt_per_dt": "2003-06-30"}'::jsonb)
 Planning time: 0.098 ms
 Execution time: 5214.212 ms

... so, an 80% increase in lookup and extraction time from swapping
offsets for lengths.  That's actually all extraction time; I tried
removing the extraction from the query, and without it the difference
between the two queries is statistically insignificant.

-- 
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


run xmllint during build (was Re: [HACKERS] need xmllint on borka)

2014-08-14 Thread Peter Eisentraut
On 5/6/14 10:20 PM, Peter Eisentraut wrote:
 Agreed.  I have committed the SGML changes that make things valid now,
 but I will postpone the xmllint addition until the 9.5 branch, complete
 with more documentation.

Per the above announcement, here is an updated patch, also with more
documentation and explanations.

It would be especially valuable if someone with a different OS than mine
would verify whether they can install xmllint according to the
documentation, or update the documentation if not.

From 27f1fabb4afb32ec44373100ab438277ebdac806 Mon Sep 17 00:00:00 2001
From: Peter Eisentraut pete...@gmx.net
Date: Thu, 14 Aug 2014 21:48:44 -0400
Subject: [PATCH] doc: Check DocBook XML validity during the build

Building the documentation with XSLT does not check the DTD, like a
DSSSL build would.  One can often get away with having invalid XML, but
the stylesheets might then create incorrect output, as they are not
designed to handle that.  Therefore, check the validity of the XML
against the DTD, using xmllint, during the build.

Add xmllint detection to configure, and add some documentation.

xmllint comes with libxml2, which is already in use, but it might be in
a separate package, such as libxml2-utils on Debian.
---
 configure  | 43 +++
 configure.in   |  1 +
 doc/src/sgml/Makefile  |  9 -
 doc/src/sgml/docguide.sgml | 19 +--
 src/Makefile.global.in |  1 +
 5 files changed, 70 insertions(+), 3 deletions(-)

diff --git a/configure b/configure
index 0f435b5..57c2455 100755
--- a/configure
+++ b/configure
@@ -630,6 +630,7 @@ vpath_build
 PROVE
 OSX
 XSLTPROC
+XMLLINT
 COLLATEINDEX
 DOCBOOKSTYLE
 have_docbook
@@ -14415,6 +14416,48 @@ fi
 
 
 fi
+for ac_prog in xmllint
+do
+  # Extract the first word of $ac_prog, so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ $as_echo $as_me:${as_lineno-$LINENO}: checking for $ac_word 5
+$as_echo_n checking for $ac_word...  6; }
+if ${ac_cv_prog_XMLLINT+:} false; then :
+  $as_echo_n (cached)  6
+else
+  if test -n $XMLLINT; then
+  ac_cv_prog_XMLLINT=$XMLLINT # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z $as_dir  as_dir=.
+for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p $as_dir/$ac_word$ac_exec_ext; then
+ac_cv_prog_XMLLINT=$ac_prog
+$as_echo $as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext 5
+break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+XMLLINT=$ac_cv_prog_XMLLINT
+if test -n $XMLLINT; then
+  { $as_echo $as_me:${as_lineno-$LINENO}: result: $XMLLINT 5
+$as_echo $XMLLINT 6; }
+else
+  { $as_echo $as_me:${as_lineno-$LINENO}: result: no 5
+$as_echo no 6; }
+fi
+
+
+  test -n $XMLLINT  break
+done
+
 for ac_prog in xsltproc
 do
   # Extract the first word of $ac_prog, so it can be a program name with args.
diff --git a/configure.in b/configure.in
index f8a4507..0d963c6 100644
--- a/configure.in
+++ b/configure.in
@@ -1864,6 +1864,7 @@ PGAC_PROG_JADE
 PGAC_CHECK_DOCBOOK(4.2)
 PGAC_PATH_DOCBOOK_STYLESHEETS
 PGAC_PATH_COLLATEINDEX
+AC_CHECK_PROGS(XMLLINT, xmllint)
 AC_CHECK_PROGS(XSLTPROC, xsltproc)
 AC_CHECK_PROGS(OSX, [osx sgml2xml sx])
 
diff --git a/doc/src/sgml/Makefile b/doc/src/sgml/Makefile
index 271c700..d2517f2 100644
--- a/doc/src/sgml/Makefile
+++ b/doc/src/sgml/Makefile
@@ -40,6 +40,10 @@ ifndef OSX
 OSX = osx
 endif
 
+ifndef XMLLINT
+XMLLINT = xmllint
+endif
+
 ifndef XSLTPROC
 XSLTPROC = xsltproc
 endif
@@ -76,6 +80,7 @@ override SPFLAGS += -wall -wno-unused-param -wno-empty -wfully-tagged
 man distprep-man: man-stamp
 
 man-stamp: stylesheet-man.xsl postgres.xml
+	$(XMLLINT) --noout --valid postgres.xml
 	$(XSLTPROC) $(XSLTPROCFLAGS) $(XSLTPROC_MAN_FLAGS) $^
 	touch $@
 
@@ -252,11 +257,13 @@ endif
 xslthtml: xslthtml-stamp
 
 xslthtml-stamp: stylesheet.xsl postgres.xml
+	$(XMLLINT) --noout --valid postgres.xml
 	$(XSLTPROC) $(XSLTPROCFLAGS) $(XSLTPROC_HTML_FLAGS) $^
 	cp $(srcdir)/stylesheet.css html/
 	touch $@
 
 htmlhelp: stylesheet-hh.xsl postgres.xml
+	$(XMLLINT) --noout --valid postgres.xml
 	$(XSLTPROC) $(XSLTPROCFLAGS) $^
 
 %-A4.fo.tmp: stylesheet-fo.xsl %.xml
@@ -266,7 +273,6 @@ htmlhelp: stylesheet-hh.xsl postgres.xml
 	$(XSLTPROC) $(XSLTPROCFLAGS) --stringparam paper.type USletter -o $@ $^
 
 FOP = fop
-XMLLINT = xmllint
 
 # reformat FO output so that locations of errors are easier to find
 %.fo: %.fo.tmp
@@ -279,6 +285,7 @@ XMLLINT = xmllint
 
 epub: postgres.epub
 postgres.epub: postgres.xml
+	$(XMLLINT) --noout --valid $
 	$(DBTOEPUB) $
 
 
diff --git a/doc/src/sgml/docguide.sgml b/doc/src/sgml/docguide.sgml
index 816375f..2fe325e 100644
--- a/doc/src/sgml/docguide.sgml
+++ b/doc/src/sgml/docguide.sgml
@@ -149,6 +149,20 @@ titleTool Sets/title
 /varlistentry
 
 varlistentry
+ termulink url=http://xmlsoft.org/;Libxml2/ulink for 

Re: [HACKERS] jsonb format is pessimal for toast compression

2014-08-14 Thread Tom Lane
Josh Berkus j...@agliodbs.com writes:
 And with Tom's test patch:
 ...
 Since that improved things a *lot*, just +40% instead of +200%, I
 thought I'd test some select queries.

That test patch is not meant to be fast; its ambition was only to see
what the effects on storage size would be.  So I find this unsurprising:

 ... so, an 80% increase in lookup and extraction time for swapping
 offsets for lengths.

We can certainly reduce that.  The question was whether it would be
worth the effort to try.  At this point, with three different test
data sets having shown clear space savings, I think it is worth
the effort.  I'll poke into it tomorrow or over the weekend, unless
somebody beats me to it.

regards, tom lane


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] replication commands and index terms

2014-08-14 Thread Fujii Masao
On Fri, Aug 15, 2014 at 4:59 AM, Robert Haas robertmh...@gmail.com wrote:
 On Thu, Aug 14, 2014 at 12:59 PM, Fujii Masao masao.fu...@gmail.com wrote:
 Since I sometimes try to search the replication command in the index,
 I'd feel inclined to expose all those commands as index terms...

 +1.

Attached patch exposes replication commands as index terms.
Barring any objection, I will commit this.
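
The change is mechanical: each command's <term> simply gains an
<indexterm>, e.g. for IDENTIFY_SYSTEM the entry becomes roughly:

    <varlistentry>
     <term>IDENTIFY_SYSTEM
      <indexterm><primary>IDENTIFY_SYSTEM</primary></indexterm>
     </term>
     <listitem>
      <para>
       Requests the server to identify itself.
      </para>
     </listitem>
    </varlistentry>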

Regards,

-- 
Fujii Masao
*** a/doc/src/sgml/protocol.sgml
--- b/doc/src/sgml/protocol.sgml
***
*** 1327,1333  psql dbname=postgres replication=database -c IDENTIFY_SYSTEM;
  The commands accepted in walsender mode are:
  variablelist
varlistentry
! termIDENTIFY_SYSTEM/term
  listitem
   para
Requests the server to identify itself. Server replies with a result
--- 1327,1335 
  The commands accepted in walsender mode are:
  variablelist
varlistentry
! termIDENTIFY_SYSTEM
!  indextermprimaryIDENTIFY_SYSTEM/primary/indexterm
! /term
  listitem
   para
Requests the server to identify itself. Server replies with a result
***
*** 1390,1396  The commands accepted in walsender mode are:
/varlistentry
  
varlistentry
! termTIMELINE_HISTORY replaceable class=parametertli/replaceable/term
  listitem
   para
Requests the server to send over the timeline history file for timeline
--- 1392,1400 
/varlistentry
  
varlistentry
! termTIMELINE_HISTORY replaceable class=parametertli/replaceable
!  indextermprimaryTIMELINE_HISTORY/primary/indexterm
! /term
  listitem
   para
Requests the server to send over the timeline history file for timeline
***
*** 1406,1412  The commands accepted in walsender mode are:
/term
listitem
para
!Filename of the timeline history file, e.g 0002.history.
/para
/listitem
/varlistentry
--- 1410,1416 
/term
listitem
para
!Filename of the timeline history file, e.g filename0002.history/.
/para
/listitem
/varlistentry
***
*** 1428,1434  The commands accepted in walsender mode are:
/varlistentry
  
varlistentry
! termCREATE_REPLICATION_SLOT replaceable class=parameterslot_name/ { literalPHYSICAL/ | literalLOGICAL/ replaceable class=parameteroutput_plugin/ } indextermprimaryCREATE_REPLICATION_SLOT/primary/indexterm/term
  listitem
   para
Create a physical or logical replication
--- 1432,1440 
/varlistentry
  
varlistentry
! termCREATE_REPLICATION_SLOT replaceable class=parameterslot_name/ { literalPHYSICAL/ | literalLOGICAL/ replaceable class=parameteroutput_plugin/ }
!  indextermprimaryCREATE_REPLICATION_SLOT/primary/indexterm
! /term
  listitem
   para
Create a physical or logical replication
***
*** 1460,1466  The commands accepted in walsender mode are:
/varlistentry
  
varlistentry
! termSTART_REPLICATION [literalSLOT/literal replaceable class=parameterslot_name/] [literalPHYSICAL/literal] replaceable class=parameterXXX/XXX/ [literalTIMELINE/literal replaceable class=parametertli/]/term
  listitem
   para
Instructs server to start streaming WAL, starting at
--- 1466,1474 
/varlistentry
  
varlistentry
! termSTART_REPLICATION [literalSLOT/literal replaceable class=parameterslot_name/] [literalPHYSICAL/literal] replaceable class=parameterXXX/XXX/ [literalTIMELINE/literal replaceable class=parametertli/]
!  indextermprimarySTART_REPLICATION/primary/indexterm
! /term
  listitem
   para
Instructs server to start streaming WAL, starting at
***
*** 1850,1856  The commands accepted in walsender mode are:
/varlistentry
  
varlistentry
! termDROP_REPLICATION_SLOT replaceable class=parameterslot_name//term
  listitem
   para
Drops a replication slot, freeing any reserved server-side resources. If
--- 1858,1866 
/varlistentry
  
varlistentry
! termDROP_REPLICATION_SLOT replaceable class=parameterslot_name/
!  indextermprimaryDROP_REPLICATION_SLOT/primary/indexterm
! /term
  listitem
   para
Drops a replication slot, freeing any reserved server-side resources. If
***
*** 1870,1876  The commands accepted in walsender mode are:
/varlistentry
  
varlistentry
! termBASE_BACKUP [literalLABEL/literal replaceable'label'/replaceable] [literalPROGRESS/literal] [literalFAST/literal] [literalWAL/literal] [literalNOWAIT/literal] [literalMAX_RATE/literal replaceablerate/replaceable]/term
  listitem
   para
Instructs the server to start streaming a base backup.
--- 1880,1888 
/varlistentry
  
varlistentry
! termBASE_BACKUP [literalLABEL/literal replaceable'label'/replaceable] [literalPROGRESS/literal] [literalFAST/literal] 

Re: run xmllint during build (was Re: [HACKERS] need xmllint on borka)

2014-08-14 Thread Tom Lane
Peter Eisentraut pete...@gmx.net writes:
 It would especially be valuable if someone with a different-than-mine OS
 would verify whether they can install xmllint according to the
 documentation, or update the documentation if not.

FWIW, xmllint appears to be part of the base libxml2 package on Red Hat
(checked on RHEL6 and Fedora rawhide).

I'm not sure I'd phrase it like this:

+   This library and the commandxmllint/command tool it contains are
+   used for processing XML.  Many developers will already

The library surely does not contain the tool; more like vice versa.
Perhaps provides would be a better verb.

regards, tom lane


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] alter user set local_preload_libraries.

2014-08-14 Thread Kyotaro HORIGUCHI
Hello, I made a set of patches to fix this issue.

The attached files are the following:

0001_Add_PGC_BACKEND_USERSET_v0.patch:

  Add new GUC category PGC_BACKEND_USERSET and change
  local_preload_libraries to use it.

0002_dblink_follow_change_of_set_config_options_v0.patch:
0003_postgres_fdw_follow_change_of_set_config_options_v0.patch

  Change contrib modules to follow the change of
  set_config_options.
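
For reference, the tri-state changeVal mentioned in the quoted mail
below would be roughly of this shape (names are provisional, not
necessarily what the patch uses):

    typedef enum
    {
        GUC_CHECK_ONLY,            /* check the value only, do not apply it */
        GUC_EXEC_ON_SESSION,       /* normal on-session execution */
        GUC_EXEC_ON_SESSION_START  /* apply while the session is starting */
    } GucChangeVal;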

regards,
-- 
Kyotaro Horiguchi
NTT Open Source Software Center

On Thu, Jul 3, 2014 at 1:05 PM, Kyotaro HORIGUCHI
horiguchi.kyot...@lab.ntt.co.jp wrote:

 Hello, I'm getting confused.  The distinction between local_ and
 session_ seems obscure.

 At Wed, 02 Jul 2014 15:37:02 -0400, Tom Lane t...@sss.pgh.pa.us wrote in 
 20408.1404329...@sss.pgh.pa.us
 Well, there aren't that many PGC_BACKEND parameters.

 Two of them are log_connections and log_disconnections, which we've
 been arguing over whether they should be controllable by non-superusers
 in the first place.  The only other ones are ignore_system_indexes and
 post_auth_delay, which are debugging things that I can't see using with
 ALTER.  So this may be the only one where it's really of much interest.

 But I agree that the problem is triggered by the PGC_BACKEND categorization
 and not the meaning of this specific GUC.

 I put some thoughts on this.

 The current behavior is:

  - Considering setting them in postgresql.conf, the two differ only
    in the restriction on the location of the modules to be loaded.
    But setting them is allowed only for superuser equivalents (DBAs),
    so the difference has no practical meaning.

  - Considering setting them on-session, only session_* can be
    altered, and only by a superuser, but nothing useful comes of
    that.

  - Considering setting them through pg_db_role_setting using
    ALTER DATABASE/USER statements, only a superuser can set
    session_* (and only session_*), and the setting then applies to
    sessions of both superusers and non-superusers.  local_* cannot
    be set this way at all.

  - Considering inserting directly into pg_db_role_setting, a
    superuser can insert any setting, including
    local_preload_libraries, but such a setting then fails at session
    start.

  | =# INSERT INTO pg_db_role_setting VALUES(0, 16384, 
 '{local_preload_libraries=auto_explain}');
  | INSERT 0 1
  ...
  | $ psql postgres -U hoge
  | WARNING: parameter local_preload_libraries cannot be set after 
 connection start.

 In the end, I suppose the desirable requirements for using both
 *_preload_libraries are:

   - (local|session)_preload_libraries shouldn't be alterable
     on-session (i.e. they should keep PGC_BACKEND behavior).

   - ALTER ... SET local_preload_libraries should be allowed for any
     user, but the specified modules must live in the plugins
     directory.

   - ALTER ... SET session_preload_libraries should be allowed only
     for superusers, who may specify modules in any directory.

   - Both (local|session)_preload_libraries should take effect at
     session start.

   - Directly inserting into pg_db_role_setting still lets a
     superuser store arbitrary settings, but at the superuser's own
     risk.

 The changes needed to achieve the above requirements are:

   - PGC_BACKEND and PGC_SUSET/USERSET are really orthogonal
     properties, not parallel ones.  But reflecting that in a
     straightforward way seems too big a change for the benefit; a
     new context PGC_BACKEND_USERSET seems to be enough.

   - set_config_option should allow PGC_BACKEND(_USERSET) variables
     to be checked (without being changed) on-session.

   - PGC_BACKEND(_USERSET) variables should be allowed to be changed
     by set_config_option when it is told that the session is
     currently starting.  changeVal then becomes a tri-state variable
     rather than a boolean:
     (exec-on-session | exec-on-session-start | check-only).


 The above description is based on 9.4 and later.  local_* is also
 applicable to 9.3 and earlier, but PGC_BACKEND_USERSET is not needed
 there because those branches don't have session_preload_libraries.

 9.5dev clearly deserves this change.  I personally think applying it
 to all live branches is desirable, but it amounts to a kind of
 feature change, enabling functionality that could not be used
 before.

 For 9.4, since session_preload_libraries has been introduced, its
 coexistence with the current local_preload_libraries seems somewhat
 awkward.  We might want to apply this to 9.4 as well.

 For releases earlier than 9.4, no one seems to have seriously
 considered using local_preload_libraries via ALTER statements, so a
 documentation fix alone should be enough there.


 Any suggestions?

 regards,

 --
 Kyotaro Horiguchi
 NTT Open Source Software Center


 --
 Sent via pgsql-hackers 

Re: [HACKERS] option -T in pg_basebackup doesn't work on windows

2014-08-14 Thread Amit Kapila
On Wed, Aug 13, 2014 at 9:20 PM, MauMau maumau...@gmail.com wrote:

 From: Amit Kapila amit.kapil...@gmail.com

 During my recent work on pg_basebackup, I noticed that
 -T option doesn't seem to work on Windows.
 The reason for the same is that while updating symlinks
 it doesn't consider that on Windows, junction points can
 be directories due to which it is not able to update the
 symlink location.
 Fix is to make the code work like symlink removal code
 in destroy_tablespace_directories.  Attached patch fixes
 problem.


 I could reproduce the problem on my Windows machine.

 The code change appears correct, but the patch application failed against
the latest source code.  I don't know why.  Could you confirm this?

 patching file src/bin/pg_basebackup/pg_basebackup.c
 Hunk #1 FAILED at 1119.
 1 out of 1 hunk FAILED -- saving rejects to file
src/bin/pg_basebackup/pg_basebackup.c.rej

It failed due to one of the recent commits, as mentioned by
Michael.  Please find the rebased patch attached to this mail.

 On the following line, I think %d must be %u, because Oid is an unsigned
integer.

  char *linkloc = psprintf("%s/pg_tblspc/%d", basedir, oid);

Yeah, although this was not introduced by this patch, I think it
should be fixed, and I have fixed it in the attached patch.
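
So the line would become, roughly:

  /* Oid is unsigned, hence %u */
  char *linkloc = psprintf("%s/pg_tblspc/%u", basedir, oid);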

With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com


pg_basebackup_relocate_tablespace_v2.patch
Description: Binary data

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] pg_shmem_allocations view

2014-08-14 Thread Michael Paquier
And here are some comments about patch 2:
- Patch applies with some hunks.
- Some typos are present
s#memory segments..#memory segments. (double dots)
s#NULL#<literal>NULL</># (in the docs, as this refers to a value)
- Your thoughts about providing separate patches for each view? What
this patch does is straight-forward, but pg_shmem_allocations does not
actually depend on the first patch adding size and name to the dsm
fields. So IMO it makes sense to separate each feature properly.
- off should be renamed to offset for pg_get_shmem_allocations.
- Is it really worth showing unused shared memory? I'd rather rip out
the last portion of pg_get_shmem_allocations.
- For refcnt in pg_get_dynamic_shmem_allocations, could you add a
comment mentioning that refcnt = 1 means that the item is moribund and
0 is unused, and that reference count for active dsm segments only
begins from 2? I would imagine that this is enough, instead of adding
some #defines specifying the value from which a dsm item is considered
active.
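
Concretely, the comment I have in mind would be something like:

  /*
   * refcnt == 0: slot is unused
   * refcnt == 1: segment is moribund (being destroyed)
   * refcnt >= 2: segment is active
   */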

Regards,
-- 
Michael


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


[HACKERS] improving speed of make check-world

2014-08-14 Thread Peter Eisentraut
make check-world creates a temporary installation in every subdirectory
it runs a test in, which is stupid: it's very slow and uses a lot of
disk space.  It's enough to do this once per run.  That is the essence
of what I have implemented.  It cuts the time for make check-world in
half or less, and it saves gigabytes of disk space.

The idea is that we only maintain one temporary installation under the
top-level directory.  By looking at the variable MAKELEVEL, we can tell
whether we are the top-level make invocation.  If so, make a temp
installation.  If not, we know that the upper layers have already done
it and we can just point to the existing temp installation.

I do this by ripping out the temp installation logic from pg_regress and
implementing it directly in the makefiles.  This is much simpler and has
additional potential benefits:

The pg_regress temp install mode is actually a combination of two
functionalities: temp install plus temp instance.  Until now, you could
only get the two together, but the temp instance functionality is
actually quite useful by itself.  It opens up the possibility of
implementing make check for external pgxs modules, for example.

Also, you could now run the temp installation step using parallel make,
making it even faster.  This was previously disabled because the make
flags would have to pass through pg_regress.  It still won't quite work
to run make check-world -j8, say, because we can't actually run the
tests in parallel (yet), but something like make -C src/test/regress
check -j8 should work.

To that end, I have renamed the pg_regress --temp-install option to
--temp-instance.  Since --temp-install is only used inside the source
tree, this shouldn't cause any compatibility problems.

MSVC build is not yet adjusted for this.  Looking at vcregress.pl, this
should be fairly straightforward.
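
The core of the makefile logic, much simplified (the actual patch
differs in details such as logging and cleanup), looks like this:

# In Makefile.global: only the top-level make builds the temporary
# installation; child makes detect this via MAKELEVEL and reuse it.
ifeq ($(MAKELEVEL),0)
temp-install:
	rm -rf '$(abs_top_builddir)'/tmp_install
	$(MKDIR_P) '$(abs_top_builddir)'/tmp_install
	$(MAKE) -C '$(top_builddir)' DESTDIR='$(abs_top_builddir)'/tmp_install install
else
temp-install: ;
endif
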
diff --git a/.gitignore b/.gitignore
index 681af08..823d3ac 100644
--- a/.gitignore
+++ b/.gitignore
@@ -34,3 +34,4 @@ lib*.pc
 /pgsql.sln.cache
 /Debug/
 /Release/
+/tmp_install/
diff --git a/GNUmakefile.in b/GNUmakefile.in
index 69e0824..5667943 100644
--- a/GNUmakefile.in
+++ b/GNUmakefile.in
@@ -47,6 +47,7 @@ $(call recurse,distprep,doc src config contrib)
 # it's not built by default
 $(call recurse,clean,doc contrib src config)
 clean:
+	rm -rf tmp_install/
 # Garbage from autoconf:
 	@rm -rf autom4te.cache/
 
@@ -61,6 +62,8 @@ distclean maintainer-clean:
 # Garbage from autoconf:
 	@rm -rf autom4te.cache/
 
+check-world: temp-install
+
 check check-tests: all
 
 check check-tests installcheck installcheck-parallel installcheck-tests:
diff --git a/contrib/earthdistance/Makefile b/contrib/earthdistance/Makefile
index 93dcbe3..cde1ae6 100644
--- a/contrib/earthdistance/Makefile
+++ b/contrib/earthdistance/Makefile
@@ -7,7 +7,7 @@ DATA = earthdistance--1.0.sql earthdistance--unpackaged--1.0.sql
 PGFILEDESC = earthdistance - calculate distances on the surface of the Earth
 
 REGRESS = earthdistance
-REGRESS_OPTS = --extra-install=contrib/cube
+EXTRA_INSTALL = contrib/cube
 
 LDFLAGS_SL += $(filter -lm, $(LIBS))
 
diff --git a/contrib/pg_upgrade/test.sh b/contrib/pg_upgrade/test.sh
index 7bbd2c7..7d493d9 100644
--- a/contrib/pg_upgrade/test.sh
+++ b/contrib/pg_upgrade/test.sh
@@ -80,7 +80,7 @@ if [ $1 = '--install' ]; then
 	# use psql from the proper installation directory, which might
 	# be outdated or missing. But don't override anything else that's
 	# already in EXTRA_REGRESS_OPTS.
-	EXTRA_REGRESS_OPTS=$EXTRA_REGRESS_OPTS --psqldir='$bindir'
+	EXTRA_REGRESS_OPTS=$EXTRA_REGRESS_OPTS --bindir='$bindir'
 	export EXTRA_REGRESS_OPTS
 fi
 
diff --git a/contrib/test_decoding/Makefile b/contrib/test_decoding/Makefile
index d7f32c3..6210ddb 100644
--- a/contrib/test_decoding/Makefile
+++ b/contrib/test_decoding/Makefile
@@ -39,35 +39,33 @@ submake-test_decoding:
 
 REGRESSCHECKS=ddl rewrite toast permissions decoding_in_xact binary prepared
 
-regresscheck: all | submake-regress submake-test_decoding
+regresscheck: all | submake-regress submake-test_decoding temp-install
 	$(MKDIR_P) regression_output
 	$(pg_regress_check) \
 	--temp-config $(top_srcdir)/contrib/test_decoding/logical.conf \
-	--temp-install=./tmp_check \
-	--extra-install=contrib/test_decoding \
+	--temp-instance=./tmp_check \
 	--outputdir=./regression_output \
 	$(REGRESSCHECKS)
 
-regresscheck-install-force: | submake-regress submake-test_decoding
+regresscheck-install-force: | submake-regress submake-test_decoding temp-install
 	$(pg_regress_installcheck) \
-	--extra-install=contrib/test_decoding \
 	$(REGRESSCHECKS)
 
 ISOLATIONCHECKS=mxact delayed_startup concurrent_ddl_dml
 
-isolationcheck: all | submake-isolation submake-test_decoding
+isolationcheck: all | submake-isolation submake-test_decoding temp-install
 	$(MKDIR_P) isolation_output
 	$(pg_isolation_regress_check) \
 	--temp-config $(top_srcdir)/contrib/test_decoding/logical.conf \
-