goller 2004/09/30 05:40:28 Modified: src/java/org/apache/lucene/index FieldsReader.java FieldsWriter.java src/java/org/apache/lucene/document Field.java src/test/org/apache/lucene/document TestDocument.java Added: src/test/org/apache/lucene/document TestBinaryDocument.java Log: Allow stored fields to be compressed (see Bug#31149) Revision Changes Path 1.11 +60 -9 jakarta-lucene/src/java/org/apache/lucene/index/FieldsReader.java Index: FieldsReader.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/FieldsReader.java,v retrieving revision 1.10 retrieving revision 1.11 diff -u -r1.10 -r1.11 --- FieldsReader.java 16 Sep 2004 21:13:37 -0000 1.10 +++ FieldsReader.java 30 Sep 2004 12:40:26 -0000 1.11 @@ -16,7 +16,10 @@ * limitations under the License. */ +import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.util.zip.DataFormatException; +import java.util.zip.Inflater; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -66,29 +69,77 @@ FieldInfo fi = fieldInfos.fieldInfo(fieldNumber); byte bits = fieldsStream.readByte(); - - if ((bits & 2) != 0) { + + boolean compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0; + boolean tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0; + + if ((bits & FieldsWriter.FIELD_IS_BINARY) != 0) { final byte[] b = new byte[fieldsStream.readVInt()]; fieldsStream.readBytes(b, 0, b.length); - doc.add(new Field(fi.name, b)); + if (compressed) + doc.add(new Field(fi.name, uncompress(b), Field.Store.COMPRESS)); + else + doc.add(new Field(fi.name, b, Field.Store.YES)); } else { Field.Index index; - boolean tokenize = (bits & 1) != 0; + Field.Store store = Field.Store.YES; + if (fi.isIndexed && tokenize) index = Field.Index.TOKENIZED; else if (fi.isIndexed && !tokenize) index = Field.Index.UN_TOKENIZED; else index = Field.Index.NO; - doc.add(new Field(fi.name, // name - 
fieldsStream.readString(), // read value - Field.Store.YES, index, - fi.storeTermVector ? Field.TermVector.YES : Field.TermVector.NO)); + + if (compressed) { + store = Field.Store.COMPRESS; + final byte[] b = new byte[fieldsStream.readVInt()]; + fieldsStream.readBytes(b, 0, b.length); + doc.add(new Field(fi.name, // field name + new String(uncompress(b), "UTF-8"), // uncompress the value and add as string + store, + index, + fi.storeTermVector ? Field.TermVector.YES : Field.TermVector.NO)); + } + else + doc.add(new Field(fi.name, // name + fieldsStream.readString(), // read value + store, + index, + fi.storeTermVector ? Field.TermVector.YES : Field.TermVector.NO)); } } return doc; } + private final byte[] uncompress(final byte[] input) + throws IOException + { + + Inflater decompressor = new Inflater(); + decompressor.setInput(input); + + // Create an expandable byte array to hold the decompressed data + ByteArrayOutputStream bos = new ByteArrayOutputStream(input.length); + + // Decompress the data + byte[] buf = new byte[1024]; + while (!decompressor.finished()) { + try { + int count = decompressor.inflate(buf); + bos.write(buf, 0, count); + } + catch (DataFormatException e) { + // this will happen if the field is not compressed + throw new IOException ("field data are in wrong format: " + e.toString()); + } + } + + decompressor.end(); + + // Get the decompressed data + return bos.toByteArray(); + } } 1.6 +62 -5 jakarta-lucene/src/java/org/apache/lucene/index/FieldsWriter.java Index: FieldsWriter.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/FieldsWriter.java,v retrieving revision 1.5 retrieving revision 1.6 diff -u -r1.5 -r1.6 --- FieldsWriter.java 28 Sep 2004 18:15:52 -0000 1.5 +++ FieldsWriter.java 30 Sep 2004 12:40:26 -0000 1.6 @@ -16,8 +16,10 @@ * the License. 
*/ +import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.Enumeration; +import java.util.zip.Deflater; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -26,6 +28,10 @@ final class FieldsWriter { + static final short FIELD_IS_TOKENIZED = 1; + static final short FIELD_IS_BINARY = 2; + static final short FIELD_IS_COMPRESSED = 4; + private FieldInfos fieldInfos; private IndexOutput fieldsStream; @@ -63,21 +69,72 @@ byte bits = 0; if (field.isTokenized()) - bits |= 1; + bits |= FieldsWriter.FIELD_IS_TOKENIZED; if (field.isBinary()) - bits |= 2; + bits |= FieldsWriter.FIELD_IS_BINARY; + if (field.isCompressed()) + bits |= FieldsWriter.FIELD_IS_COMPRESSED; + fieldsStream.writeByte(bits); - - if (field.isBinary()) { + + if (field.isCompressed()) { + // compression is enabled for the current field + byte[] data = null; + // check if it is a binary field + if (field.isBinary()) { + data = compress(field.binaryValue()); + } + else { + data = compress(field.stringValue().getBytes("UTF-8")); + } + final int len = data.length; + fieldsStream.writeVInt(len); + fieldsStream.writeBytes(data, len); + } + else { + // compression is disabled for the current field + if (field.isBinary()) { byte[] data = field.binaryValue(); final int len = data.length; fieldsStream.writeVInt(len); fieldsStream.writeBytes(data, len); - } else { + } + else { fieldsStream.writeString(field.stringValue()); + } } } } } + private final byte[] compress (byte[] input) { + + // Create the compressor with highest level of compression + Deflater compressor = new Deflater(); + compressor.setLevel(Deflater.BEST_COMPRESSION); + + // Give the compressor the data to compress + compressor.setInput(input); + compressor.finish(); + + /* + * Create an expandable byte array to hold the compressed data. 
+ * You cannot use an array that's the same size as the orginal because + * there is no guarantee that the compressed data will be smaller than + * the uncompressed data. + */ + ByteArrayOutputStream bos = new ByteArrayOutputStream(input.length); + + // Compress the data + byte[] buf = new byte[1024]; + while (!compressor.finished()) { + int count = compressor.deflate(buf); + bos.write(buf, 0, count); + } + + compressor.end(); + + // Get the compressed data + return bos.toByteArray(); + } } 1.24 +62 -27 jakarta-lucene/src/java/org/apache/lucene/document/Field.java Index: Field.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/document/Field.java,v retrieving revision 1.23 retrieving revision 1.24 diff -u -r1.23 -r1.24 --- Field.java 15 Sep 2004 12:50:22 -0000 1.23 +++ Field.java 30 Sep 2004 12:40:26 -0000 1.24 @@ -33,15 +33,16 @@ public final class Field implements java.io.Serializable { private String name = "body"; - private String stringValue = null; - private Reader readerValue = null; - private byte[] binaryValue = null; + + // the one and only data object for all different kind of field values + private Object fieldsData = null; private boolean storeTermVector = false; private boolean isStored = false; private boolean isIndexed = true; private boolean isTokenized = true; private boolean isBinary = false; + private boolean isCompressed = false; private float boost = 1.0f; @@ -54,6 +55,10 @@ public String toString() { return name; } + /** Store the original field value in the index in a compressed form. This is + * useful for long documents and for binary valued fields. + */ + public static final Store COMPRESS = new Store("COMPRESS"); /** Store the original field value in the index. This is useful for short texts * like a document's title which should be displayed with the results. The * value is stored in its original form, i.e. 
no analyzer is used before it is @@ -220,18 +225,22 @@ /** The name of the field (e.g., "date", "title", "body", ...) as an interned string. */ - public String name() { return name; } + public String name() { return name; } /** The value of the field as a String, or null. If null, the Reader value - is used. Exactly one of stringValue() and readerValue() must be set. */ - public String stringValue() { return stringValue; } + * or binary value is used. Exactly one of stringValue(), readerValue(), and + * binaryValue() must be set. */ + public String stringValue() { try { return (String)fieldsData; } catch (ClassCastException ignore) { return null; } } + /** The value of the field as a Reader, or null. If null, the String value - is used. Exactly one of stringValue() and readerValue() must be set. */ - public Reader readerValue() { return readerValue; } + * or binary value is used. Exactly one of stringValue(), readerValue(), + * and binaryValue() must be set. */ + public Reader readerValue() { try { return (Reader)fieldsData; } catch (ClassCastException ignore) { return null; } } + /** The value of the field in Binary, or null. If null, the Reader or - String value is used. Exactly one of stringValue(), readerValue() and - binaryValue() must be set. */ - public byte[] binaryValue() { return binaryValue; } + * String value is used. Exactly one of stringValue(), readerValue() and + * binaryValue() must be set. 
*/ + public byte[] binaryValue() { try { return (byte[])fieldsData; } catch (ClassCastException ignore) { return null; } } /** * Create a field by specifying its name, value and how it will @@ -277,12 +286,16 @@ if (index == Index.NO && termVector != TermVector.NO) throw new IllegalArgumentException("cannot store term vector information " + "for a field that is not indexed"); - + this.name = name.intern(); // field names are interned - this.stringValue = value; + this.fieldsData = value; if (store == Store.YES) this.isStored = true; + else if (store == Store.COMPRESS) { + this.isStored = true; + this.isCompressed = true; + } else if (store == Store.NO) this.isStored = false; else @@ -331,7 +344,7 @@ if (reader == null) throw new NullPointerException("reader cannot be null"); this.name = name.intern(); // field names are interned - this.readerValue = reader; + this.fieldsData = reader; this.isStored = false; this.isIndexed = true; this.isTokenized = true; @@ -344,18 +357,31 @@ * @deprecated use {@link #Field(String, String, Field.Store, Field.Index)} instead */ public Field(String name, String string, - boolean store, boolean index, boolean token) { + boolean store, boolean index, boolean token) { this(name, string, store, index, token, false); } - public Field(String name, byte[] value) { + + /** + * Create a stored field with binary value. Optionally the value may be compressed. + * + * @param name The name of the field + * @param value The binary value + * @param store How <code>value</code> should be stored (compressed or not.)
+ */ + public Field(String name, byte[] value, Store store) { if (name == null) throw new IllegalArgumentException("name cannot be null"); if (value == null) throw new IllegalArgumentException("value cannot be null"); + if (store == Store.NO) + throw new IllegalArgumentException("binary values can't be unstored"); + if (store == Store.COMPRESS) + this.isCompressed = true; this.name = name.intern(); - this.binaryValue = value; + //wrap the byte[] to a ByteBuffer object + this.fieldsData = value; this.isBinary = true; this.isStored = true; @@ -377,7 +403,7 @@ * @deprecated use {@link #Field(String, String, Field.Store, Field.Index, Field.TermVector)} instead */ public Field(String name, String string, - boolean store, boolean index, boolean token, boolean storeTermVector) { + boolean store, boolean index, boolean token, boolean storeTermVector) { if (name == null) throw new NullPointerException("name cannot be null"); if (string == null) @@ -385,8 +411,8 @@ if (!index && storeTermVector) throw new IllegalArgumentException("cannot store a term vector for fields that are not indexed"); - this.name = name.intern(); // field names are interned - this.stringValue = string; + this.name = name.intern(); // field names are interned + this.fieldsData = string; this.isStored = store; this.isIndexed = index; this.isTokenized = token; @@ -406,16 +432,19 @@ /** True iff the value of the field is to be stored in the index for return with search hits. It is an error for this to be true if a field is Reader-valued. */ - public final boolean isStored() { return isStored; } + public final boolean isStored() { return isStored; } /** True iff the value of the field is to be indexed, so that it may be searched on. */ - public final boolean isIndexed() { return isIndexed; } + public final boolean isIndexed() { return isIndexed; } /** True iff the value of the field should be tokenized as text prior to indexing.
Un-tokenized fields are indexed as a single word and may not be Reader-valued. */ - public final boolean isTokenized() { return isTokenized; } + public final boolean isTokenized() { return isTokenized; } + + /** True if the value of the field is stored and compressed within the index */ + public final boolean isCompressed() { return isCompressed; } /** True iff the term or terms used to index this field are stored as a term * vector, available from {@link IndexReader#getTermFreqVector(int,String)}. @@ -456,14 +485,20 @@ result.append("binary"); } + if (isCompressed) { + if (result.length() > 0) + result.append(","); + result.append("compressed"); + } + result.append('<'); result.append(name); result.append(':'); - if (readerValue != null) { - result.append(readerValue.toString()); - } else { - result.append(stringValue); + + if (fieldsData != null) { + result.append(fieldsData); } + result.append('>'); return result.toString(); } 1.9 +3 -3 jakarta-lucene/src/test/org/apache/lucene/document/TestDocument.java Index: TestDocument.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/test/org/apache/lucene/document/TestDocument.java,v retrieving revision 1.8 retrieving revision 1.9 diff -u -r1.8 -r1.9 --- TestDocument.java 15 Sep 2004 12:50:23 -0000 1.8 +++ TestDocument.java 30 Sep 2004 12:40:28 -0000 1.9 @@ -47,8 +47,8 @@ { Document doc = new Document(); Field stringFld = new Field("string", binaryVal, Field.Store.YES, Field.Index.NO); - Field binaryFld = new Field("binary", binaryVal.getBytes()); - Field binaryFld2 = new Field("binary", binaryVal2.getBytes()); + Field binaryFld = new Field("binary", binaryVal.getBytes(), Field.Store.YES); + Field binaryFld2 = new Field("binary", binaryVal2.getBytes(), Field.Store.YES); doc.add(stringFld); doc.add(binaryFld); 1.1 jakarta-lucene/src/test/org/apache/lucene/document/TestBinaryDocument.java Index: TestBinaryDocument.java
=================================================================== package org.apache.lucene.document; import junit.framework.TestCase; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.store.RAMDirectory; /** * Copyright 2004 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Tests [EMAIL PROTECTED] Document} class. * * @author Bernhard Messer * @version $Id: TestBinaryDocument.java,v 1.1 2004/09/30 12:40:28 goller Exp $ */ public class TestBinaryDocument extends TestCase { String binaryValStored = "this text will be stored as a byte array in the index"; String binaryValCompressed = "this text will be also stored and compressed as a byte array in the index"; public void testBinaryFieldInIndex() throws Exception { Field binaryFldStored = new Field("binaryStored", binaryValStored.getBytes(), Field.Store.YES); Field binaryFldCompressed = new Field("binaryCompressed", binaryValCompressed.getBytes(), Field.Store.COMPRESS); Field stringFldStored = new Field("stringStored", binaryValStored, Field.Store.YES, Field.Index.NO, Field.TermVector.NO); Field stringFldCompressed = new Field("stringCompressed", binaryValCompressed, Field.Store.COMPRESS, Field.Index.NO, Field.TermVector.NO); try { // binary fields with store off are not allowed new Field("fail", binaryValCompressed.getBytes(), Field.Store.NO); fail(); } catch 
(IllegalArgumentException iae) { ; } Document doc = new Document(); doc.add(binaryFldStored); doc.add(binaryFldCompressed); doc.add(stringFldStored); doc.add(stringFldCompressed); /** test for field count */ assertEquals(4, doc.fields.size()); /** add the doc to a ram index */ RAMDirectory dir = new RAMDirectory(); IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true); writer.addDocument(doc); writer.close(); /** open a reader and fetch the document */ IndexReader reader = IndexReader.open(dir); Document docFromReader = reader.document(0); assertTrue(docFromReader != null); /** fetch the binary stored field and compare it's content with the original one */ String binaryFldStoredTest = new String(docFromReader.getBinaryValue("binaryStored")); assertTrue(binaryFldStoredTest.equals(binaryValStored)); /** fetch the binary compressed field and compare it's content with the original one */ String binaryFldCompressedTest = new String(docFromReader.getBinaryValue("binaryCompressed")); assertTrue(binaryFldCompressedTest.equals(binaryValCompressed)); /** fetch the string field and compare it's content with the original one */ String stringFldStoredTest = new String(docFromReader.get("stringStored")); assertTrue(stringFldStoredTest.equals(binaryValStored)); /** fetch the compressed string field and compare it's content with the original one */ String stringFldCompressedTest = new String(docFromReader.get("stringCompressed")); assertTrue(stringFldCompressedTest.equals(binaryValCompressed)); /** delete the document from index */ reader.delete(0); assertEquals(0, reader.numDocs()); reader.close(); } }
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]