I'm using PostgreSQL 15.5.

Here's a self-contained example. I included links to public pgMustard query
plans.

Gist link: https://gist.github.com/adriangb/05a01cca99a438de2a286945903c44f4

Also copied below for archiving:
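
A note on prerequisites, since the script below leans on two extensions: gen_random_bytes() comes from pgcrypto and time_bucket() from TimescaleDB. If they aren't already installed, something like this is needed first (timescaledb also has to be loaded via shared_preload_libraries):

```sql
-- Assumed prerequisites for the script below
CREATE EXTENSION IF NOT EXISTS pgcrypto;    -- provides gen_random_bytes()
CREATE EXTENSION IF NOT EXISTS timescaledb; -- provides time_bucket()
```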

```sql
CREATE OR REPLACE FUNCTION random_bytes(length integer)
RETURNS bytea AS $$
DECLARE
  bytes bytea := '';
  i integer := 0;
BEGIN
  -- gen_random_bytes() returns at most 1024 bytes per call, so build the result in 1024-byte chunks
  WHILE i < length LOOP
    bytes := bytes || gen_random_bytes(least(1024, length - i));
    i := i + 1024;
  END LOOP;

  RETURN bytes;
END;
$$ LANGUAGE plpgsql;

DROP TABLE IF EXISTS original, original_hashed, dictionary_table CASCADE;
DROP VIEW IF EXISTS joined_view;

CREATE TABLE original AS (
  WITH large_random_bytes AS (
    SELECT n AS id, random_bytes(4096 + n) AS attributes
    FROM generate_series(1, 1000) n
  )
  SELECT
    -- An incrementing timestamp
    '2024-01-01 00:00:00.00+00'::timestamptz + (n || ' seconds')::interval AS start_timestamp,
    -- Another relatively small column, a string derived from `n` (about 100 distinct values)
    'random_string_' || (n % 100) AS event_name,
    -- The attributes column
    lrb.attributes AS attributes
    FROM generate_series(0, 1000000) n
    JOIN large_random_bytes lrb ON n % 1000 = lrb.id
);
CREATE INDEX ON original(start_timestamp);

CREATE TABLE dictionary_table AS (
  SELECT DISTINCT
    time_bucket('1 day', start_timestamp) AS start_timestamp_range,
    attributes,
    md5(attributes) AS hash
  FROM original
);
CREATE INDEX ON dictionary_table (start_timestamp_range, hash);

CREATE TABLE original_hashed AS (
  SELECT
    start_timestamp,
    event_name,
    md5(attributes) AS hash
  FROM original
);
CREATE INDEX ON original_hashed (start_timestamp, time_bucket('1 day', start_timestamp), hash);


CREATE VIEW joined_view AS (
  SELECT
    original_hashed.start_timestamp,
    original_hashed.event_name,
    dictionary_table.attributes
  FROM original_hashed
  LEFT JOIN dictionary_table ON (
    time_bucket('1 day', original_hashed.start_timestamp) = dictionary_table.start_timestamp_range
    AND
    original_hashed.hash = dictionary_table.hash
  )
);

-- Select all data
explain (analyze, buffers, verbose, settings, format json)
SELECT *
FROM joined_view
WHERE (
  start_timestamp > ('2024-01-01 00:00:00.00+00'::timestamptz + interval '1 day')
  AND
  start_timestamp < ('2024-01-01 00:00:00.00+00'::timestamptz + interval '2 day')
);
-- https://app.pgmustard.com/#/explore/f8b3b7d5-c236-4326-9a67-20fa4cdff4bc

explain (analyze, buffers, verbose, settings, format json)
SELECT *
FROM original
WHERE (
  start_timestamp > ('2024-01-01 00:00:00.00+00'::timestamptz + interval '1 day')
  AND
  start_timestamp < ('2024-01-01 00:00:00.00+00'::timestamptz + interval '2 day')
);
-- https://app.pgmustard.com/#/explore/bcad3767-b8aa-4a37-acc7-7667266f29be

-- A relatively selective aggregation query
explain (analyze, buffers, verbose, settings, format json)
SELECT count(*)
FROM joined_view
WHERE (
  get_byte(attributes, 4) < 100
  AND
  start_timestamp > ('2024-01-01 00:00:00.00+00'::timestamptz + interval '1 day')
  AND
  start_timestamp < ('2024-01-01 00:00:00.00+00'::timestamptz + interval '2 day')
);
-- https://app.pgmustard.com/#/explore/c5b68839-4c66-4b14-8de5-757bfc22fdb3

explain (analyze, buffers, verbose, settings, format json)
SELECT count(*)
FROM original
WHERE (
  get_byte(attributes, 4) < 100
  AND
  start_timestamp > ('2024-01-01 00:00:00.00+00'::timestamptz + interval '1 day')
  AND
  start_timestamp < ('2024-01-01 00:00:00.00+00'::timestamptz + interval '2 day')
);
-- https://app.pgmustard.com/#/explore/831d9c4c-e911-4caf-a6c3-eee5343e44d8

-- Select only some columns
explain (analyze, buffers, verbose, settings, format json)
SELECT start_timestamp
FROM joined_view
WHERE (
  start_timestamp > ('2024-01-01 00:00:00.00+00'::timestamptz + interval '1 day')
  AND
  start_timestamp < ('2024-01-01 00:00:00.00+00'::timestamptz + interval '2 day')
);
-- https://app.pgmustard.com/#/explore/5f617bfa-7816-4f8d-9fd7-24388c08978c

explain (analyze, buffers, verbose, settings, format json)
SELECT start_timestamp
FROM original
WHERE (
  start_timestamp > ('2024-01-01 00:00:00.00+00'::timestamptz + interval '1 day')
  AND
  start_timestamp < ('2024-01-01 00:00:00.00+00'::timestamptz + interval '2 day')
);
-- https://app.pgmustard.com/#/explore/f05de3e8-ca8d-4404-ad33-69f103a3b85a


-- A relatively selective query
explain (analyze, buffers, verbose, settings, format json)
SELECT *
FROM joined_view
WHERE (
  get_byte(attributes, 4) < 100
  AND
  start_timestamp > ('2024-01-01 00:00:00.00+00'::timestamptz + interval '1 day')
  AND
  start_timestamp < ('2024-01-01 00:00:00.00+00'::timestamptz + interval '2 day')
);
-- https://app.pgmustard.com/#/explore/a75b20bd-07d5-4402-a0d8-22419682307a

explain (analyze, buffers, verbose, settings, format json)
SELECT *
FROM original
WHERE (
  get_byte(attributes, 4) < 100
  AND
  start_timestamp > ('2024-01-01 00:00:00.00+00'::timestamptz + interval '1 day')
  AND
  start_timestamp < ('2024-01-01 00:00:00.00+00'::timestamptz + interval '2 day')
);
-- https://app.pgmustard.com/#/explore/47ef84f0-a96e-4baa-af40-2ec241cbb6e2
```
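
One follow-up that's not in the gist above, so treat it as a sketch: for queries against joined_view that don't touch the attributes column, I believe the planner can only remove the LEFT JOIN if dictionary_table is provably unique on the join keys, which the plain index above doesn't give it. Something like this should do it:

```sql
-- Sketch (not part of the gist): make the join keys provably unique so that
-- PostgreSQL's left-join removal can elide the join when no dictionary_table
-- columns are referenced.
CREATE UNIQUE INDEX ON dictionary_table (start_timestamp_range, hash);

-- With that in place, this should plan as a scan of original_hashed only,
-- with no join node:
explain (analyze, buffers, verbose, settings, format json)
SELECT start_timestamp, event_name
FROM joined_view
WHERE (
  start_timestamp > ('2024-01-01 00:00:00.00+00'::timestamptz + interval '1 day')
  AND
  start_timestamp < ('2024-01-01 00:00:00.00+00'::timestamptz + interval '2 day')
);
```

In other words, (start_timestamp_range, hash) is effectively the primary key of dictionary_table, and declaring that uniqueness is what gives the planner the proof it needs to drop the join.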

On Tue, Feb 13, 2024 at 11:04 AM Ron Johnson <ronljohnso...@gmail.com>
wrote:

> 1. Show us the PG version, view definition, the exact query that's slow,
> and the EXPLAIN (ANALYZE).
> 2. Presumably there's an index on each table's *_hash column?
>
> On Tue, Feb 13, 2024 at 8:48 AM Adrian Garcia Badaracco <
> adr...@adriangb.com> wrote:
>
>> Thank you for the reply Ron.
>>
>> Yes, new_table has many fewer rows (<1% of the original table's row count).
>>
>> Thanks for making me think of normalization, I hadn’t seen it that way.
>> Although there is no theoretical relationship between the rows in the other
>> columns in the original table and the attributes column, in practice there
>> is a strong correlation, so I guess what I am trying to capture here is
>> taking advantage of that correlation, while not completely depending on it
>> because it can be broken.
>>
>> In any case, whatever theoretical framework is put around this solution,
>> I am also interested in the practical aspects, in particular that case of
>> selecting a subset of columns from the view that I know doesn’t need the
>> join but the query planner thinks does.
>>
>> On Tue, Feb 13, 2024 at 3:16 AM Ron Johnson <ronljohnso...@gmail.com>
>> wrote:
>>
>>> On Mon, Feb 12, 2024 at 10:12 PM Adrian Garcia Badaracco <
>>> adr...@adriangb.com> wrote:
>>>
>>>> I am using Timescale so I'll be mentioning some timestamp stuff but I
>>>> think this is a general postgres question for the most part.
>>>>
>>>> I have a table with some fixed, small columns (id, timestamp, etc) and
>>>> a large JSONB column (let's call it `attributes`). `attributes` has 1000s
>>>> of schemas, but given a schema, there's a lot of duplication. Across all
>>>> rows, more than 99% of the data is duplicated (as measured by
>>>> `count(attributes)` vs `count(distinct attributes)`).
>>>>
>>>> I can't normalize `attributes` into real columns because it is quite
>>>> variable (remember 1000s of schemas).
>>>>
>>>> My best idea is to make a table like `(day timestamptz, hash text,
>>>> attributes jsonb)` and then in my original table replace `attributes` with
>>>> a reference to `new_table`.
>>>>
>>>
>>> Meaning that there are many fewer rows in new_table?
>>>
>>>
>>>> I can then make a view that joins them `select
>>>> original_table.timestamp, new_table.attributes from original join new_table
>>>> on (time_bucket('1 day', timestamp) = day AND original.hash =
>>>> new_table.hash)` or something like that. The idea of time bucketing into 1
>>>> day is to balance write and read speed (by relying on timescale to do
>>>> efficient time partitioning, data retention, etc.).
>>>>
>>>
>>>> I recognize this is essentially creating a key-value store in postgres
>>>> and also janky compression, so I am cautious about it.
>>>>
>>>
>>> If my interpretation (that there are many fewer rows in new_table) is
>>> correct, then you've stumbled into the Second Normal Form of database
>>> design: https://en.wikipedia.org/wiki/Second_normal_form#Example
>>>
>>>
