[GitHub] keith-turner commented on a change in pull request #300: ACCUMULO-4708 Limit RFile block size to 2GB
keith-turner commented on a change in pull request #300: ACCUMULO-4708 Limit RFile block size to 2GB URL: https://github.com/apache/accumulo/pull/300#discussion_r147172071 ## File path: core/src/main/java/org/apache/accumulo/core/security/crypto/RFileCipherOutputStream.java ## @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * see the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.accumulo.core.security.crypto; + +import java.io.IOException; +import java.io.OutputStream; + +import javax.crypto.Cipher; +import javax.crypto.CipherOutputStream; + +/** + * + * This class extends {@link CipherOutputStream} to include a way to track the number of bytes that have + * been encrypted by the stream. The write method also includes a mechanism to stop writing and + * throw an exception if exceeding a maximum number of bytes is attempted. + * + */ +public class RFileCipherOutputStream extends CipherOutputStream { + + // This is the maximum size encrypted stream that can be written. Attempting to write anything larger + // will cause an exception. Given that each block in an rfile is encrypted separately, and blocks + // should be written such that a block cannot ever reach 16GiB, this is believed to be a safe number. 
+ // If this does cause an exception, it is an issue best addressed elsewhere. + private final long maxOutputSize = 1L << 34; //16GiB + + // the cipher engine to use to process stream data + private Cipher cipher; + + // the underlying output stream + private OutputStream output; + + // the buffer holding data ready to be written out + private byte[] obuffer; + + // The total number of bytes that have been written out + private long count = 0; + + /** + * + * Constructs a RFileCipherOutputStream + * + * @param os + * the OutputStream object + * @param c + * an initialized Cipher object + */ + public RFileCipherOutputStream(OutputStream os, Cipher c) { +super(os, c); +output = os; +cipher = c; + } + + /** + * Override of CipherOutputStream's write to count the number of bytes that have been encrypted. + * This method now throws an exception if an attempt to write bytes beyond a maximum is made. + * + * Writes len bytes from the specified byte array starting at offset off to this output stream. + * + * @param b + * the data. + * @param off + * the start offset in the data. + * @param len + * the number of bytes to write. + * @exception IOException + * if an I/O error occurs. + * @since JCE1.2 + */ + @Override + public void write(byte b[], int off, int len) throws IOException { +count += len; +if (count > maxOutputSize) { + throw new IOException("Attempt to write " + count + " bytes was made. A maximum of " + maxOutputSize + " is allowed for an encryption stream."); +} +obuffer = cipher.update(b, off, len); Review comment: can you call super.write here? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] keith-turner commented on a change in pull request #300: ACCUMULO-4708 Limit RFile block size to 2GB
keith-turner commented on a change in pull request #300: ACCUMULO-4708 Limit RFile block size to 2GB URL: https://github.com/apache/accumulo/pull/300#discussion_r139749197 ## File path: core/src/main/java/org/apache/accumulo/core/file/rfile/RFile.java ## @@ -118,6 +118,9 @@ private RFile() {} // Buffer sample data so that many sample data blocks are stored contiguously. private static int sampleBufferSize = 1000; + // 5 bytes of overhead for fields: value, row, colFamily, colQualifier, colVisibility, and 8 for the timestamp Review comment: would be nice to mention what the overhead is... it's worst-case serialization overhead This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] keith-turner commented on a change in pull request #300: ACCUMULO-4708 Limit RFile block size to 2GB
keith-turner commented on a change in pull request #300: ACCUMULO-4708 Limit RFile block size to 2GB URL: https://github.com/apache/accumulo/pull/300#discussion_r139748937 ## File path: core/src/main/java/org/apache/accumulo/core/data/Key.java ## @@ -1218,9 +1219,9 @@ public Object clone() throws CloneNotSupportedException { * Throws an exception if the key is too large. * */ - public void sanityCheckKey() { -// If fails the check -if (((long) this.row.length + (long) this.colFamily.length + (long) this.colQualifier.length + (long) this.colVisibility.length + (4L * 5L + 8L)) >= Integer.MAX_VALUE) { + private void sanityCheckKey() { + //If the key is too large, we throw an exception. We subtract 5L from KEY_VALUE_OVERHEAD to remove the accounting for value overhead +if (((long) this.row.length + (long) this.colFamily.length + (long) this.colQualifier.length + (long) this.colVisibility.length + (RFile.KEY_VALUE_OVERHEAD-5L)) >= Integer.MAX_VALUE) { Review comment: could have two constants in RFile : `KEY_OVERHEAD` and `VALUE_OVERHEAD` then the subtraction of `5L` is not needed here, just use `KEY_OVERHEAD` This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] keith-turner commented on a change in pull request #300: ACCUMULO-4708 Limit RFile block size to 2GB
keith-turner commented on a change in pull request #300: ACCUMULO-4708 Limit RFile block size to 2GB URL: https://github.com/apache/accumulo/pull/300#discussion_r139695797 ## File path: core/src/main/java/org/apache/accumulo/core/file/rfile/RFile.java ## @@ -426,6 +426,12 @@ public void append(Key key, Value value) throws IOException { throw new IllegalArgumentException("Keys appended out-of-order. New key " + key + ", previous key " + prevKey); } + // 5 bytes of overhead for fields: value, row, colFamily, colQualifier, colVisibility, and 8 for the long timestamp + if (((long) key.getSize() + (long) value.getSize() + (5L * 5L + 8L)) >= Integer.MAX_VALUE) { Review comment: Could make `(5L * 5L + 8L)` a constant in RFile and then Key could also use that constant. The constant could have a comment about its purpose then the code that uses it does not need that comment. I am thinking it would be better to put the constant in RFile instead of Key because Key is in the public API, so the constant would end up in the API if in Key. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] keith-turner commented on a change in pull request #300: ACCUMULO-4708 Limit RFile block size to 2GB
keith-turner commented on a change in pull request #300: ACCUMULO-4708 Limit RFile block size to 2GB URL: https://github.com/apache/accumulo/pull/300#discussion_r139695797 ## File path: core/src/main/java/org/apache/accumulo/core/file/rfile/RFile.java ## @@ -426,6 +426,12 @@ public void append(Key key, Value value) throws IOException { throw new IllegalArgumentException("Keys appended out-of-order. New key " + key + ", previous key " + prevKey); } + // 5 bytes of overhead for fields: value, row, colFamily, colQualifier, colVisibility, and 8 for the long timestamp + if (((long) key.getSize() + (long) value.getSize() + (5L * 5L + 8L)) >= Integer.MAX_VALUE) { Review comment: Could make `(5L * 5L + 8L)` a constant in RFile and then Key could also use that constant. The constant could have a comment about its purpose then the code that uses it does not need that comment. I am thinking it would be better to put the constant in RFile instead of Key because Key is in the public API, so the constant would end up in the API if in Key. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] keith-turner commented on a change in pull request #300: ACCUMULO-4708 Limit RFile block size to 2GB
keith-turner commented on a change in pull request #300: ACCUMULO-4708 Limit RFile block size to 2GB URL: https://github.com/apache/accumulo/pull/300#discussion_r139694924 ## File path: core/src/main/java/org/apache/accumulo/core/file/rfile/RFile.java ## @@ -426,6 +426,12 @@ public void append(Key key, Value value) throws IOException { throw new IllegalArgumentException("Keys appended out-of-order. New key " + key + ", previous key " + prevKey); } + // 5 bytes of overhead for fields: value, row, colFamily, colQualifier, colVisibility, and 8 for the long timestamp + if (((long) key.getSize() + (long) value.getSize() + (5L * 5L + 8L)) >= Integer.MAX_VALUE) { +throw new IllegalArgumentException("Key/value pair is too large (" + ((long) key.getSize() + (long) value.getSize()) Review comment: I would separate the key and val sizes in the message... something like the following with a space between... gives a little more info. ``` throw new IllegalArgumentException("Key/value pair is too large (" + ((long) key.getSize() +" " + (long) value.getSize()) ``` This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] keith-turner commented on a change in pull request #300: ACCUMULO-4708 Limit RFile block size to 2GB
keith-turner commented on a change in pull request #300: ACCUMULO-4708 Limit RFile block size to 2GB URL: https://github.com/apache/accumulo/pull/300#discussion_r139693814 ## File path: core/src/main/java/org/apache/accumulo/core/data/Key.java ## @@ -1207,4 +1209,21 @@ public Object clone() throws CloneNotSupportedException { r.colVisibility = Arrays.copyOf(colVisibility, colVisibility.length); return r; } + + /** + * Checks the max size of a key because external operations, such as writing to RFiles, require the key size fits within [0,Integer.MAX_VALUE] + * + * Accounts for 5 bytes of overhead for each array and 8 bytes of overhead for the timestamp. + * + * Throws an exception if the key is too large. + * + */ + public void sanityCheckKey() { Review comment: why make this public and add it to the API? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] keith-turner commented on a change in pull request #300: ACCUMULO-4708 Limit RFile block size to 2GB
keith-turner commented on a change in pull request #300: ACCUMULO-4708 Limit RFile block size to 2GB URL: https://github.com/apache/accumulo/pull/300#discussion_r139479400 ## File path: core/src/main/java/org/apache/accumulo/core/file/rfile/RFile.java ## @@ -426,6 +426,10 @@ public void append(Key key, Value value) throws IOException { throw new IllegalArgumentException("Keys appended out-of-order. New key " + key + ", previous key " + prevKey); } + if (((long) key.getSize() + (long) value.getSize()) >= Integer.MAX_VALUE) { +throw new IllegalArgumentException("Key/value pair is too large to be appended to RFile."); Review comment: Anything we can do to help someone on the receiving end of this message debug it would be good. We could include the size of key+value in the error message. Also, Key.toString() truncates anything that is too long so we could leverage that to possibly include the key. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] keith-turner commented on a change in pull request #300: ACCUMULO-4708 Limit RFile block size to 2GB
keith-turner commented on a change in pull request #300: ACCUMULO-4708 Limit RFile block size to 2GB URL: https://github.com/apache/accumulo/pull/300#discussion_r139481475 ## File path: core/src/main/java/org/apache/accumulo/core/data/Key.java ## @@ -101,6 +101,9 @@ private final void init(byte r[], int rOff, int rLen, byte cf[], int cfOff, int colVisibility = copyIfNeeded(cv, cvOff, cvLen, copy); timestamp = ts; deleted = del; +if (!sanityCheckKey()) { + throw new IllegalArgumentException("Invalid Key entry. Key exceeds " + Integer.MAX_VALUE + " bytes."); Review comment: Would be nice to include the size. Also why not have the sanityCheck method throw the exception? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] keith-turner commented on a change in pull request #300: ACCUMULO-4708 Limit RFile block size to 2GB
keith-turner commented on a change in pull request #300: ACCUMULO-4708 Limit RFile block size to 2GB URL: https://github.com/apache/accumulo/pull/300#discussion_r139480306 ## File path: core/src/main/java/org/apache/accumulo/core/file/rfile/RFile.java ## @@ -426,6 +426,10 @@ public void append(Key key, Value value) throws IOException { throw new IllegalArgumentException("Keys appended out-of-order. New key " + key + ", previous key " + prevKey); } + if (((long) key.getSize() + (long) value.getSize()) >= Integer.MAX_VALUE) { Review comment: Serialization will also add overhead. For example, some bytes encoding the length of the row are written before the row's data. So this means that what can really be written is less than what is checked for here. Not sure if we want to try to account for that here. We could do something simple like assume each field will need at least 5 bytes for the length (I think 5 is the max because we are using var int). Even if not accounting for that serialization overhead this is still a really nice check. It will catch most situations at the source leaving a small range of key+val size that would only be caught later when encoding the block. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] keith-turner commented on a change in pull request #300: ACCUMULO-4708 Limit RFile block size to 2GB
keith-turner commented on a change in pull request #300: ACCUMULO-4708 Limit RFile block size to 2GB URL: https://github.com/apache/accumulo/pull/300#discussion_r139481135 ## File path: core/src/main/java/org/apache/accumulo/core/file/rfile/RFile.java ## @@ -441,19 +445,27 @@ public void append(Key key, Value value) throws IOException { } else if (blockWriter.getRawSize() > blockSize) { // Look for a key thats short to put in the index, defining short as average or below. -if (avergageKeySize == 0) { +if (averageKeySize == 0) { // use the same average for the search for a below average key for a block - avergageKeySize = keyLenStats.getMean(); + averageKeySize = keyLenStats.getMean(); } // Possibly produce a shorter key that does not exist in data. Even if a key can be shortened, it may not be below average. Key closeKey = KeyShortener.shorten(prevKey, key); -if ((closeKey.getSize() <= avergageKeySize || blockWriter.getRawSize() > maxBlockSize) && !isGiantKey(closeKey)) { +if ((closeKey.getSize() <= averageKeySize || blockWriter.getRawSize() > maxBlockSize) && !isGiantKey(closeKey)) { closeBlock(closeKey, false); blockWriter = fileWriter.prepareDataBlock(); // set average to zero so its recomputed for the next block - avergageKeySize = 0; + averageKeySize = 0; + // If the block reaches or exceeds 2GB, it has no way to determine the amount of data actually written. To prevent + // this, we check to see if adding the key/value will create a problem, and if it will, we force a transition to + // the next block. +} else if (((long) key.getSize() + (long) value.getSize() + blockWriter.getRawSize()) >= Integer.MAX_VALUE) { Review comment: may want to consider adding some arbitrary overhead for serialization metadata here... like 128 bytes. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services