Here is a Java unit test that uses LogByteSizeMergePolicy to control the
maximum size of segment files during indexing. That is, it tries to; it
does not succeed. Would someone who truly understands the merge policy
code please take a look? There is probably one tiny parameter missing.

It adds 50 documents, each 100 KB in size.

It creates an index in a RAMDirectory, which I expected to end up with one
segment that's a tad over 1 MB, followed by a set of segments that are each
a tad over 500 KB. Instead, nothing flushes until the commit, which writes
one 5 MB segment.
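
My best guess, sketched here against the constants in the test below and
assuming that flushing is driven by the RAM buffer rather than by the merge
policy, is that the writer wants a real setRAMBufferSizeMB value instead of
DISABLE_AUTO_FLUSH:

    // Flush a new segment roughly every RAMBUFFER_MB of buffered documents.
    // As far as I can tell, LogByteSizeMergePolicy only decides which
    // already-flushed segments get merged; it never triggers a flush itself.
    config.setRAMBufferSizeMB(RAMBUFFER_MB);
    config.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH);

If that assumption is wrong, I would be glad to hear what actually triggers
the flush. The full test follows.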


-------------------------------------------------------------------------------
org.apache.lucene.index.TestIndexWriterMergeMB
-------------------------------------------------------------------------------

package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.LuceneTestCase;

/*
 * Verify that segment sizes are limited to a maximum number of bytes.
 *
 * Sizing:
 *  Max merge size is 0.5 MB. Verify against this plus 100 KB slop (1.2x).
 *  Min merge size is 10 KB.
 *  Each document is 100 KB.
 *  mergeFactor=2.
 *  RAM buffer is 1 MB. Verify against this plus 200 KB slop (1.2x).
 *
 *  This test should cause the RAM buffer to flush after 10 documents
 *  and create a CFS a little over 1 MB.
 *  The later documents should be flushed to disk every 5-6 documents,
 *  creating CFS files a little over 0.5 MB.
 */


public class TestIndexWriterMergeMB extends LuceneTestCase {
  private static final int MERGE_FACTOR = 2;
  private static final double RAMBUFFER_MB = 1.0;
  static final double MIN_MB = 0.01d;
  static final double MAX_MB = 0.5d;
  static final double SLOP_FACTOR = 1.2d;
  static final double MB = 1000*1000;
  static String VALUE_100k = null;

  // Test controlling the merge policy's maximum segment size in MB
  public void testMaxMergeMB() throws IOException {
    Directory dir = new RAMDirectory();
    IndexWriterConfig config = new IndexWriterConfig(
        TEST_VERSION_CURRENT, new WhitespaceAnalyzer(TEST_VERSION_CURRENT));

    LogByteSizeMergePolicy mergeMB = new LogByteSizeMergePolicy();
    config.setMergePolicy(mergeMB);
    mergeMB.setMinMergeMB(MIN_MB);
    mergeMB.setMaxMergeMB(MAX_MB);
    mergeMB.setUseCompoundFile(true);
    mergeMB.setMergeFactor(MERGE_FACTOR);
    config.setMaxBufferedDocs(100);  // irrelevant, but the next line fails without this
    config.setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH);
    MergeScheduler scheduler = new SerialMergeScheduler();
    config.setMergeScheduler(scheduler);
    IndexWriter writer = new IndexWriter(dir, config);

    System.out.println("Start indexing");
    for (int i = 0; i < 50; i++) {
      addDoc(writer, i);
      printSegmentSizes(dir);
    }
    checkSegmentSizes(dir);
    System.out.println("Commit");
    writer.commit();
    printSegmentSizes(dir);
    checkSegmentSizes(dir);
    writer.close();
  }

  // Adds a document that takes roughly 100 KB of RAM.
  private void addDoc(IndexWriter writer, int i) throws IOException {
    if (VALUE_100k == null) {
      StringBuilder value = new StringBuilder(100000);
      for(int fill = 0; fill < 100000; fill ++) {
        value.append('a');
      }
      VALUE_100k = value.toString();
    }
    Document doc = new Document();
    doc.add(new Field("id", i + "", Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(new Field("content", VALUE_100k, Field.Store.YES,
Field.Index.NOT_ANALYZED));
    writer.addDocument(doc);
  }


  private void checkSegmentSizes(Directory dir) {
    try {
      String[] files = dir.listAll();
      for (String file : files) {
        if (file.equals("_0.cfs")) {
          long length = dir.fileLength(file);
          assertTrue("First segment: " + file + " size = " + length + " < "
              + (int) ((SLOP_FACTOR * RAMBUFFER_MB) * MB), length <
(SLOP_FACTOR * RAMBUFFER_MB) * MB);
        } else if (file.endsWith(".cfs")) {
          long length = dir.fileLength(file);
          assertTrue("Later segment: " + file + " size = " + length +
" should be < "
              + (int) ((SLOP_FACTOR * MAX_MB) * MB), length <
(SLOP_FACTOR * MAX_MB) * MB);
        }
      }
    } catch (IOException e) {
      System.err.println("Impossible: " + e.getMessage());
    }
  }

  private void printSegmentSizes(Directory dir) {
    try {
      String[] files = dir.listAll();
      System.out.println("Print index");
      for(String file: files) {
        if (file.endsWith(".cfs")) {
          long length = dir.fileLength(file);
          System.out.println("  file: " + file + " has " + length + " bytes");
        }
      }
    } catch (IOException e) {
      System.err.println("Impossible: " + e.getMessage());
    }
  }
}
