[
https://issues.apache.org/jira/browse/LUCENE-3425?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13766169#comment-13766169
]
caviler commented on LUCENE-3425:
---------------------------------
Yes, I changed the default MergePolicy to AverageMergePolicy, and I closed the
old reader after reopenIfChanged.
Before upgrading to Lucene 4.4.0, this AverageMergePolicy worked well.
Recently we upgraded to Lucene 4.4.0 and modified AverageMergePolicy to
adapt it to version 4.4.0. After this, in testing (adding a lot of
documents in a very short time), we noticed there were very many small
segments (more than 40,000), and those small segments never seemed to get
a chance to be merged.
What is the difference between the merge behavior of versions 3.6.2 and 4.4.0?
Our index is about 72 GB, split into 148 segments by AverageMergePolicy, each
segment about 500 MB in size.
{code}
public class AverageMergePolicy extends MergePolicy
{
/**
* Default noCFSRatio. If a merge's size is >= 10% of the index, then we
* disable compound file for it.
*
* @see #setNoCFSRatio
*/
public static final double DEFAULT_NO_CFS_RATIO = 0.1;
private long maxSegmentSizeMB = 100L; //
protected double noCFSRatio = DEFAULT_NO_CFS_RATIO;
private boolean partialExpunge = false; //
protected boolean useCompoundFile = true;
public AverageMergePolicy()
{
}
@Override
public void close()
{
}
@Override
public MergeSpecification findForcedDeletesMerges(final SegmentInfos infos)
throws CorruptIndexException,
IOException
{
try
{
//
SegmentInfoPerCommit best = null;
final int numSegs = infos.size();
for (int i = 0; i < numSegs; i++)
{
final SegmentInfoPerCommit info = infos.info(i);
if (info.hasDeletions())
{
if (null == best || info.getDelCount() > best.getDelCount())
{
best = info;
}
}
}
final Collection<SegmentInfoPerCommit> mergingSegments =
writer.get().getMergingSegments();
if (mergingSegments.contains(best))
{
return null; // skip merging segment
}
final MergeSpecification spec = new MergeSpecification();
if (null != best)
{
spec.add(new OneMerge(Collections.singletonList(best)));
}
return spec;
}
catch (final Throwable e)
{
e.printStackTrace();
return null;
}
}
@Override
public MergeSpecification findForcedMerges(final SegmentInfos infos,
final int maxNumSegments,
final Map<SegmentInfoPerCommit,
Boolean> segmentsToMerge)
throws IOException
{
return findMerges(MergeTrigger.EXPLICIT, infos);
}
@Override
public MergeSpecification findMerges(final MergeTrigger mergeTrigger, final
SegmentInfos infos) throws IOException
{
// partialExpunge = false; //
// partialExpunge = true;
final long maxSegSize = maxSegmentSizeMB * 1024L * 1024L; //
long bestSegSize = maxSegSize; //
try
{
final int numSegs = infos.size();
int numBestSegs = numSegs;
{
//
SegmentInfoPerCommit info;
long totalSegSize = 0;
// compute the total size of segments
for (int i = 0; i < numSegs; i++)
{
info = infos.info(i);
final long size = size(info);
totalSegSize += size;
}
numBestSegs = (int) ((totalSegSize + bestSegSize - 1) /
bestSegSize); //
bestSegSize = (numBestSegs == 0) ? totalSegSize : (totalSegSize
+ maxSegSize - 1) / numBestSegs; //
if (bestSegSize > maxSegSize)
{
bestSegSize = maxSegSize; //
numBestSegs = (int) ((totalSegSize + maxSegSize - 1) /
bestSegSize);
}
}
MergeSpecification spec = findOneMerge(infos, bestSegSize);
//int branch = 0;
if (null == spec && partialExpunge)
{
//branch = 1;
//
final OneMerge expunge = findOneSegmentToExpunge(infos, 0);
if (expunge != null)
{
spec = new MergeSpecification();
spec.add(expunge);
}
}
// MergeLogger.collect(branch, spec);
Application.sleep(100); //
return spec;
}
catch (final Throwable e)
{
e.printStackTrace();
return null;
}
}
/**
*
*
*
*
* @param sizes
* @param bestSegSize
*
* @return
*/
private int[] findOneMerge(final long[] sizes, final long bestSegSize)
{
final int[] merge = findSmallestOneMerge(sizes, bestSegSize);
if (null != merge)
{
return merge;
}
final int n = sizes.length;
for (int i = 0; i < n; i++)
{
final long size1 = sizes[i];
if (size1 < bestSegSize) //
{
//
for (int j = 0; j < n; j++)
{
if (i != j)
{
final long size2 = sizes[j];
if (size1 + size2 <= bestSegSize) //
{
return new int[]
{ i, j };
}
}
}
}
}
return null;
}
/**
*
*/
private MergeSpecification findOneMerge(final SegmentInfos infos, final
long bestSegSize) throws IOException
{
final int n = infos.size();
final long[] sizes = new long[n];
//
{
SegmentInfoPerCommit info;
for (int i = 0; i < n; i++)
{
info = infos.info(i);
sizes[i] = size(info);
}
}
final int[] pair = findOneMerge(sizes, bestSegSize);
if (null == pair)
{
return null; //
}
final int target1 = pair[0];
final int target2 = pair[1];
final SegmentInfoPerCommit info1 = infos.info(target1);
final SegmentInfoPerCommit info2 = infos.info(target2);
final Collection<SegmentInfoPerCommit> mergingSegments =
writer.get().getMergingSegments();
if (mergingSegments.contains(info1))
{
return null; // skip merging segment
}
if (mergingSegments.contains(info2))
{
return null; // skip merging segment
}
// MergeLogger.debug("findOneMerge info1 = " + info1.name + " " +
SizeUtil.normalizeSizeString(sizes[target1]));
// MergeLogger.debug("findOneMerge info2 = " + info2.name + " " +
SizeUtil.normalizeSizeString(sizes[target2]));
final List<SegmentInfoPerCommit> mergeInfos = new
ArrayList<SegmentInfoPerCommit>(2);
mergeInfos.add(info1);
mergeInfos.add(info2);
final MergeSpecification spec = new MergeSpecification();
spec.add(new OneMerge(mergeInfos));
return spec;
}
/**
*
*
* @param infos
* @param maxNumSegments
* @return
* @throws IOException
*/
private OneMerge findOneSegmentToExpunge(final SegmentInfos infos, final
int maxNumSegments) throws IOException
{
int expungeCandidate = -1;
int maxDelCount = 0;
for (int i = maxNumSegments - 1; i >= 0; i--)
{
final SegmentInfoPerCommit info = infos.info(i);
final int delCount = info.getDelCount();
if (delCount > maxDelCount)
{
expungeCandidate = i;
maxDelCount = delCount;
}
}
if (maxDelCount > 0)
{
return new
OneMerge(Collections.singletonList(infos.info(expungeCandidate)));
}
return null;
}
/**
*
*
* @param sizes
* @param bestSegSize
* @return
*/
private int[] findSmallestOneMerge(final long[] sizes, final long
bestSegSize)
{
long targetSize = -1;
int targetIndex = -1;
final int n = sizes.length;
final int skip = 0; //
//
for (int i = n - skip - 1; i >= 0; i--) //
{
final long size1 = sizes[i];
if (size1 < bestSegSize) //
{
if (-1 == targetSize || size1 < targetSize)
{
targetSize = size1;
targetIndex = i;
}
}
}
if (-1 != targetIndex)
{
for (int j = n - skip - 1; j >= 0; j--)//
{
if (targetIndex != j)
{
final long size2 = sizes[j];
if (targetSize + size2 <= bestSegSize) //
{
return new int[]
{ targetIndex, j };
}
}
}
}
return null;
}
public long getMaxSegmentSizeMB()
{
return maxSegmentSizeMB;
}
// /** @see #setNoCFSRatio */
// @Override
// public double getNoCFSRatio()
// {
// return noCFSRatio;
// }
public boolean getPartialExpunge()
{
return partialExpunge;
}
/**
* Returns true if newly flushed and newly merge segments are written in
* compound file format. @see #setUseCompoundFile
*/
public boolean getUseCompoundFile()
{
return useCompoundFile;
}
public void setMaxSegmentSizeMB(final long maxSegmentSizeMB)
{
this.maxSegmentSizeMB = maxSegmentSizeMB;
}
// /**
// * If a merged segment will be more than this percentage of the total
size
// * of the index, leave the segment as non-compound file even if
compound
// * file is enabled. Set to 1.0 to always use CFS regardless of merge
size.
// */
// @Override
// public void setNoCFSRatio(final double noCFSRatio)
// {
// if (noCFSRatio < 0.0 || noCFSRatio > 1.0)
// {
// throw new IllegalArgumentException("noCFSRatio must be 0.0 to
1.0 inclusive; got " + noCFSRatio);
// }
// this.noCFSRatio = noCFSRatio;
// }
public void setPartialExpunge(final boolean partialExpunge)
{
this.partialExpunge = partialExpunge;
}
/**
* Sets whether compound file format should be used for newly flushed and
* newly merged segments.
*/
public void setUseCompoundFile(final boolean useCompoundFile)
{
this.useCompoundFile = useCompoundFile;
}
// Javadoc inherited
@Override
public boolean useCompoundFile(final SegmentInfos infos, final
SegmentInfoPerCommit mergedInfo) throws IOException
{
final boolean doCFS;
if (!useCompoundFile)
{
doCFS = false;
}
else if (noCFSRatio == 1.0)
{
doCFS = true;
}
else
{
long totalSize = 0;
for (final SegmentInfoPerCommit info : infos)
{
totalSize += size(info);
}
doCFS = size(mergedInfo) <= noCFSRatio * totalSize;
}
return doCFS;
}
}
{code}
> NRT Caching Dir to allow for exact memory usage, better buffer allocation and
> "global" cross indices control
> ------------------------------------------------------------------------------------------------------------
>
> Key: LUCENE-3425
> URL: https://issues.apache.org/jira/browse/LUCENE-3425
> Project: Lucene - Core
> Issue Type: Improvement
> Components: core/index
> Affects Versions: 3.4, 4.0-ALPHA
> Reporter: Shay Banon
> Fix For: 5.0, 4.5
>
>
> A discussion on IRC raised several improvements that can be made to NRT
> caching dir. Some of the problems it currently has are:
> 1. Not explicitly controlling the memory usage, which can result in overusing
> memory (for example, large new segments being committed because refreshing is
> too far behind).
> 2. Heap fragmentation because of constant allocation of (probably promoted to
> old gen) byte buffers.
> 3. Not being able to control the memory usage across indices for multi index
> usage within a single JVM.
> A suggested solution (which still needs to be ironed out) is to have a
> BufferAllocator that controls allocation of byte[], and allow to return
> unused byte[] to it. It will have a cap on the size of memory it allows to be
> allocated.
> The NRT caching dir will use the allocator, which can either be provided (for
> usage across several indices) or created internally. The caching dir will
> also create a wrapped IndexOutput, that will flush to the main dir if the
> allocator can no longer provide byte[] (exhausted).
> When a file is "flushed" from the cache to the main directory, it will return
> all the currently allocated byte[] to the BufferAllocator to be reused by
> other "files".
--
This message is automatically generated by JIRA.
If you think it was sent incorrectly, please contact your JIRA administrators
For more information on JIRA, see: http://www.atlassian.com/software/jira
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]