zeroshade commented on a change in pull request #11538: URL: https://github.com/apache/arrow/pull/11538#discussion_r744298951
########## File path: go/parquet/encryption_read_config_test.go ########## @@ -0,0 +1,443 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package parquet_test + +import ( + "encoding/binary" + "fmt" + "os" + "path" + "testing" + + "github.com/apache/arrow/go/arrow/memory" + "github.com/apache/arrow/go/parquet" + "github.com/apache/arrow/go/parquet/file" + "github.com/apache/arrow/go/parquet/internal/encryption" + "github.com/stretchr/testify/suite" +) + +/* + * This file contains a unit-test for reading encrypted Parquet files with + * different decryption configurations. + * + * The unit-test is called multiple times, each time to decrypt parquet files using + * different decryption configuration as described below. + * In each call two encrypted files are read: one temporary file that was generated using + * encryption_write_config_test.go test and will be deleted upon + * reading it, while the second resides in + * parquet-testing/data repository. Those two encrypted files were encrypted using the + * same encryption configuration. + * The encrypted parquet file names are passed as parameters to the unit-test. 
+ * + * A detailed description of the Parquet Modular Encryption specification can be found + * here: + * https://github.com/apache/parquet-format/blob/encryption/Encryption.md + * + * The following decryption configurations are used to decrypt each parquet file: + * + * - Decryption configuration 1: Decrypt using key retriever that holds the keys of + * two encrypted columns and the footer key. + * - Decryption configuration 2: Decrypt using key retriever that holds the keys of + * two encrypted columns and the footer key. Supplies + * aad_prefix to verify file identity. + * - Decryption configuration 3: Decrypt using explicit column and footer keys + * (instead of key retrieval callback). + * - Decryption Configuration 4: PlainText Footer mode - test legacy reads, + * read the footer + all non-encrypted columns. + * (pairs with encryption configuration 3) + * + * The encrypted parquet file that is read was encrypted using one of the configurations + * below: + * + * - Encryption configuration 1: Encrypt all columns and the footer with the same key. + * (uniform encryption) + * - Encryption configuration 2: Encrypt two columns and the footer, with different + * keys. + * - Encryption configuration 3: Encrypt two columns, with different keys. + * Don’t encrypt footer (to enable legacy readers) + * - plaintext footer mode. + * - Encryption configuration 4: Encrypt two columns and the footer, with different + * keys. Supply aad_prefix for file identity + * verification. + * - Encryption configuration 5: Encrypt two columns and the footer, with different + * keys. Supply aad_prefix, and call + * disable_aad_prefix_storage to prevent file + * identity storage in file metadata. + * - Encryption configuration 6: Encrypt two columns and the footer, with different + * keys. Use the alternative (AES_GCM_CTR_V1) algorithm. 
+ */ + +func getDataDir() string { + datadir := os.Getenv("PARQUET_TEST_DATA") + if datadir == "" { + panic("please point the PARQUET_TEST_DATA environment variable to the test data dir") + } + return datadir +} + +type TestDecryptionSuite struct { + suite.Suite + + pathToDouble string + pathToFloat string + decryptionConfigs []*parquet.FileDecryptionProperties + footerEncryptionKey string + colEncryptionKey1 string + colEncryptionKey2 string + fileName string +} + +func (d *TestDecryptionSuite) TearDownSuite() { + os.Remove(tempdir) +} + +func TestFileEncryptionDecryption(t *testing.T) { + suite.Run(t, new(EncryptionConfigTestSuite)) + suite.Run(t, new(TestDecryptionSuite)) +} + +func (d *TestDecryptionSuite) SetupSuite() { + d.pathToDouble = "double_field" + d.pathToFloat = "float_field" + d.footerEncryptionKey = FooterEncryptionKey + d.colEncryptionKey1 = ColumnEncryptionKey1 + d.colEncryptionKey2 = ColumnEncryptionKey2 + d.fileName = FileName + + d.createDecryptionConfigs() +} + +func (d *TestDecryptionSuite) createDecryptionConfigs() { + // Decryption configuration 1: Decrypt using key retriever callback that holds the + // keys of two encrypted columns and the footer key. + stringKr1 := make(encryption.StringKeyIDRetriever) + stringKr1.PutKey("kf", d.footerEncryptionKey) + stringKr1.PutKey("kc1", d.colEncryptionKey1) + stringKr1.PutKey("kc2", d.colEncryptionKey2) + + d.decryptionConfigs = append(d.decryptionConfigs, + parquet.NewFileDecryptionProperties(parquet.WithKeyRetriever(stringKr1))) + + // Decryption configuration 2: Decrypt using key retriever callback that holds the + // keys of two encrypted columns and the footer key. Supply aad_prefix. 
+ stringKr2 := make(encryption.StringKeyIDRetriever) + stringKr2.PutKey("kf", d.footerEncryptionKey) + stringKr2.PutKey("kc1", d.colEncryptionKey1) + stringKr2.PutKey("kc2", d.colEncryptionKey2) + d.decryptionConfigs = append(d.decryptionConfigs, + parquet.NewFileDecryptionProperties(parquet.WithKeyRetriever(stringKr2), parquet.WithDecryptAadPrefix(d.fileName))) + + // Decryption configuration 3: Decrypt using explicit column and footer keys. Supply + // aad_prefix. + decryptCols := make(parquet.ColumnPathToDecryptionPropsMap) + decryptCols[d.pathToFloat] = parquet.NewColumnDecryptionProperties(d.pathToFloat, parquet.WithDecryptKey(d.colEncryptionKey2)) + decryptCols[d.pathToDouble] = parquet.NewColumnDecryptionProperties(d.pathToDouble, parquet.WithDecryptKey(d.colEncryptionKey1)) + d.decryptionConfigs = append(d.decryptionConfigs, + parquet.NewFileDecryptionProperties(parquet.WithFooterKey(d.footerEncryptionKey), parquet.WithColumnKeys(decryptCols))) + + // Decryption Configuration 4: use plaintext footer mode, read only footer + plaintext + // columns. 
+ d.decryptionConfigs = append(d.decryptionConfigs, nil) +} + +func (d *TestDecryptionSuite) decryptFile(filename string, decryptConfigNum int) { + // if we get decryption_config_num = x then it means the actual number is x+1 + // and since we want decryption_config_num=4 we set the condition to 3 + props := parquet.NewReaderProperties(memory.DefaultAllocator) + if decryptConfigNum != 3 { + props.FileDecryptProps = d.decryptionConfigs[decryptConfigNum].Clone("") + } + + fileReader, err := file.OpenParquetFile(filename, false, file.WithReadProps(props)) + if err != nil { + panic(err) + } + defer fileReader.Close() + // get metadata + fileMetadata := fileReader.MetaData() + // get number of rowgroups + numRowGroups := len(fileMetadata.RowGroups) + // number of columns + numColumns := fileMetadata.Schema.NumColumns() + d.Equal(8, numColumns) + + for r := 0; r < numRowGroups; r++ { + rowGroupReader := fileReader.RowGroup(r) + + // get rowgroup meta + rgMeta := fileMetadata.RowGroup(r) + + valuesRead := 0 + rowsRead := int64(0) + + // get col reader for boolean column + colReader := rowGroupReader.Column(0) + boolReader := colReader.(*file.BooleanColumnChunkReader) + + // get column chunk metadata for boolean column + boolMd, _ := rgMeta.ColumnChunk(0) + + // Read all rows in column + i := 0 + for boolReader.HasNext() { + var val [1]bool + // read one value at a time. the number of rows read is returned. 
values + // read contains the number of non-null rows + rowsRead, valuesRead, _ = boolReader.ReadBatch(1, val[:], nil, nil) + // ensure only 1 value is read + d.EqualValues(1, rowsRead) + // there are no null values + d.EqualValues(1, valuesRead) + // verify the value + expected := i%2 == 0 + d.Equal(expected, val[0], "i: ", i) + i++ + } + d.EqualValues(i, boolMd.NumValues()) + + // Get column reader for int32 column + colReader = rowGroupReader.Column(1) + int32reader := colReader.(*file.Int32ColumnChunkReader) + + int32md, _ := rgMeta.ColumnChunk(1) + // Read all rows in column + i = 0 + for int32reader.HasNext() { + var val [1]int32 + // read one value at a time. the number of rows read is returned. values + // read contains the number of non-null rows + rowsRead, valuesRead, _ = int32reader.ReadBatch(1, val[:], nil, nil) + // ensure only 1 value is read + d.EqualValues(1, rowsRead) + // there are no null values + d.EqualValues(1, valuesRead) + // verify the value + d.EqualValues(i, val[0]) + i++ + } + d.EqualValues(i, int32md.NumValues()) + + // Get column reader for int64 column + colReader = rowGroupReader.Column(2) + int64reader := colReader.(*file.Int64ColumnChunkReader) + + int64md, _ := rgMeta.ColumnChunk(2) + // Read all rows in column + i = 0 + for int64reader.HasNext() { + var ( + val [1]int64 + def [1]int16 + rep [1]int16 + ) + + // read one value at a time. the number of rows read is returned. 
values + // read contains the number of non-null rows + rowsRead, valuesRead, _ = int64reader.ReadBatch(1, val[:], def[:], rep[:]) + // ensure only 1 value is read + d.EqualValues(1, rowsRead) + // there are no null values + d.EqualValues(1, valuesRead) + // verify the value + expectedValue := int64(i) * 1000 * 1000 * 1000 * 1000 + d.Equal(expectedValue, val[0]) + if i%2 == 0 { + d.EqualValues(1, rep[0]) + } else { + d.Zero(rep[0]) + } + i++ + } + d.EqualValues(i, int64md.NumValues()) + + // Get column reader for int96 column + colReader = rowGroupReader.Column(3) + int96reader := colReader.(*file.Int96ColumnChunkReader) + + int96md, _ := rgMeta.ColumnChunk(3) + // Read all rows in column + i = 0 + for int96reader.HasNext() { + var ( + val [1]parquet.Int96 + ) + + // read one value at a time. the number of rows read is returned. values + // read contains the number of non-null rows + rowsRead, valuesRead, _ = int96reader.ReadBatch(1, val[:], nil, nil) + // ensure only 1 value is read + d.EqualValues(1, rowsRead) + // there are no null values + d.EqualValues(1, valuesRead) + // verify the value + var expectedValue parquet.Int96 + binary.LittleEndian.PutUint32(expectedValue[:4], uint32(i)) + binary.LittleEndian.PutUint32(expectedValue[4:], uint32(i+1)) + binary.LittleEndian.PutUint32(expectedValue[8:], uint32(i+2)) + d.Equal(expectedValue, val[0]) + i++ + } + d.EqualValues(i, int96md.NumValues()) + + if decryptConfigNum != 3 { + // Get column reader for the float column + colReader = rowGroupReader.Column(4) + floatReader := colReader.(*file.Float32ColumnChunkReader) + + floatmd, _ := rgMeta.ColumnChunk(4) + + i = 0 + for floatReader.HasNext() { + var value [1]float32 + // read one value at a time. the number of rows read is returned. 
values + // read contains the number of non-null rows + rowsRead, valuesRead, _ = floatReader.ReadBatch(1, value[:], nil, nil) + // ensure only 1 value is read + d.EqualValues(1, rowsRead) + // there are no null values + d.EqualValues(1, valuesRead) + // verify the value + expectedValue := float32(i) * 1.1 + d.Equal(expectedValue, value[0]) + i++ + } + d.EqualValues(i, floatmd.NumValues()) + + // Get column reader for the double column + colReader = rowGroupReader.Column(5) + dblReader := colReader.(*file.Float64ColumnChunkReader) + + dblmd, _ := rgMeta.ColumnChunk(5) + + i = 0 + for dblReader.HasNext() { + var value [1]float64 + // read one value at a time. the number of rows read is returned. values + // read contains the number of non-null rows + rowsRead, valuesRead, _ = dblReader.ReadBatch(1, value[:], nil, nil) + // ensure only 1 value is read + d.EqualValues(1, rowsRead) + // there are no null values + d.EqualValues(1, valuesRead) + // verify the value + expectedValue := float64(i) * 1.1111111 + d.Equal(expectedValue, value[0]) + i++ + } + d.EqualValues(i, dblmd.NumValues()) + } + + colReader = rowGroupReader.Column(6) + bareader := colReader.(*file.ByteArrayColumnChunkReader) + + bamd, _ := rgMeta.ColumnChunk(6) + + i = 0 + for bareader.HasNext() { + var value [1]parquet.ByteArray + var def [1]int16 + + rowsRead, valuesRead, _ := bareader.ReadBatch(1, value[:], def[:], nil) + d.EqualValues(1, rowsRead) + expected := [10]byte{'p', 'a', 'r', 'q', 'u', 'e', 't', 0, 0, 0} + expected[7] = byte('0') + byte(i/100) + expected[8] = byte('0') + byte(i/10)%10 + expected[9] = byte('0') + byte(i%10) + if i%2 == 0 { + d.Equal(1, valuesRead) Review comment: The lines here: https://github.com/apache/arrow/pull/11538/files/61ce47474d7ec42a80f1d6b354fbc28f4914edab#diff-cef277f99a3822e591750fc26b274b33e7be004a21833b2f38f5c438c1069178R382 and here: 
https://github.com/apache/arrow/pull/11538/files/61ce47474d7ec42a80f1d6b354fbc28f4914edab#diff-cef277f99a3822e591750fc26b274b33e7be004a21833b2f38f5c438c1069178R391 verify that a panic happens when attempting to read from an encrypted file without proper keys. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
