package proj.zoie.api.impl;
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.IOException;
import java.util.List;
import java.util.Map;

import org.apache.log4j.Logger;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LogByteSizeMergePolicy;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.index.MergePolicy;
import org.apache.lucene.index.MergeScheduler;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SegmentInfos;

/**
 * A {@link LogByteSizeMergePolicy} variant that maintains a fixed number of
 * large segments of similar sizes and applies the log-byte-size policy only to
 * the small segments behind them. Small segments are merged and promoted into
 * the large tier once their combined size reaches the average size of the
 * large segments. Optionally performs a "partial expunge", opportunistically
 * re-merging the segment with the most deletes to reclaim space.
 *
 * @author ymatsuda
 */
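/*
 * A minimal usage sketch (illustrative only, not part of the original source;
 * assumes a Lucene 3.x IndexWriterConfig and an existing analyzer):
 *
 *   ZoieMergePolicy policy = new ZoieMergePolicy();
 *   ZoieMergePolicy.MergePolicyParams params = new ZoieMergePolicy.MergePolicyParams();
 *   params.setNumLargeSegments(6);    // keep six balanced large segments
 *   params.setPartialExpunge(true);   // opportunistically reclaim deletes
 *   policy.setMergePolicyParams(params);
 *
 *   IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_35, analyzer);
 *   config.setMergePolicy(policy);
 */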
public class ZoieMergePolicy extends LogByteSizeMergePolicy
{
  public static final Logger log = Logger.getLogger(ZoieMergePolicy.class.getName());
  public static final int DEFAULT_NUM_LARGE_SEGMENTS = 6;
  public static final int DEFAULT_NUM_SMALL_SEGMENTS = 7;
  public static final int DEFAULT_MERGE_FACTOR = 6;

  private boolean _partialExpunge = false;
  private int _numLargeSegments = DEFAULT_NUM_LARGE_SEGMENTS;
  private int _maxSmallSegments = DEFAULT_NUM_SMALL_SEGMENTS; // default merge factor plus 1.
  private int _maxSegments = _numLargeSegments + _maxSmallSegments;

  public ZoieMergePolicy()
  {
    super.setMergeFactor(DEFAULT_MERGE_FACTOR); // default merge factor of 6, below Lucene's default of 10; fewer segments is good for search speed.
  }

  public void setMergePolicyParams(MergePolicyParams params)
  {
    if (params != null)
    {
      setPartialExpunge(params._doPartialExpunge);
      setNumLargeSegments(params._numLargeSegments);
      setMergeFactor(params._mergeFactor);
      setMaxSmallSegments(params._maxSmallSegments);
      setUseCompoundFile(params._useCompoundFile);
      setMaxMergeDocs(params._maxMergeDocs);
    }
  }

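  /**
   * Effective size of a segment for merge selection: the on-disk byte size
   * discounted by the fraction of deleted documents, so segments carrying many
   * deletes look smaller and get merged (and thus compacted) sooner. For
   * example, a 100MB segment with 25% of its documents deleted counts as 75MB.
   */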
  protected long size(SegmentInfo info) throws IOException
  {
    long byteSize = info.sizeInBytes(false);
    float delRatio = (info.docCount <= 0 ? 0.0f : ((float)info.getDelCount() / (float)info.docCount));
    return (info.docCount <= 0 ?  byteSize : (long)((float)byteSize * (1.0f - delRatio)));
  }

  public void setPartialExpunge(boolean doPartialExpunge)
  {
    _partialExpunge = doPartialExpunge;
  }

  public boolean getPartialExpunge()
  {
    return _partialExpunge;
  }

  public void setNumLargeSegments(int numLargeSegments)
  {
    if (numLargeSegments < 2)
    {
      log.warn("numLargeSegments cannot be less than 2, while " + numLargeSegments + " is requested. Override with 2.");
      numLargeSegments = 2;
    }
    _numLargeSegments = numLargeSegments;
    _maxSegments = _numLargeSegments + 2 * getMergeFactor();
  }

  public int getNumLargeSegments()
  {
    return _numLargeSegments;
  }

  public void setMaxSmallSegments(int maxSmallSegments)
  {
    if (maxSmallSegments < getMergeFactor()+1)
    {
      log.warn("MergeFactor is " +getMergeFactor() + ". maxSmallSegments is requested to be: "
          + maxSmallSegments + ". Override with mergeFactor + 1, since maxSmallSegments has to be greater than mergeFactor.");
      maxSmallSegments = getMergeFactor() + 1;
    }
    _maxSmallSegments = maxSmallSegments;
    _maxSegments = _numLargeSegments + _maxSmallSegments;
  }

  public int getMaxSmallSegments()
  {
    return _maxSmallSegments;
  }

  @Override
  public void setMergeFactor(int mergeFactor)
  {
    if (mergeFactor<2)
    {
      log.warn("mergeFactor has to be at least 2. Override " + mergeFactor + " with 2");
      mergeFactor = 2;
    }
    super.setMergeFactor(mergeFactor);
    if(_maxSmallSegments <= getMergeFactor())
    {
      log.warn("maxSmallSegments has to be greater than mergeFactor. Override maxSmallSegments to: " + (mergeFactor + 1));
      _maxSmallSegments = getMergeFactor() + 1;
      _maxSegments = _numLargeSegments + _maxSmallSegments;
    }
  }

  @Override
  protected boolean isMerged(SegmentInfos infos, int maxNumSegments, Map<SegmentInfo, Boolean> segmentsToOptimize) throws IOException
  {
    final int numSegments = infos.size();
    int numToOptimize = 0;
    SegmentInfo optimizeInfo = null;
    // count the segments marked for optimization, remembering the last one seen
    for (int i = 0; i < numSegments && numToOptimize <= maxNumSegments; i++)
    {
      final SegmentInfo info = infos.info(i);
      if (segmentsToOptimize.get(info) != null)
      {
        numToOptimize++;
        optimizeInfo = info;
      }
    }
    // already merged if few enough segments are marked, and a single marked
    // segment only counts when it is itself fully merged
    return numToOptimize <= maxNumSegments && (numToOptimize != 1 || isMerged(optimizeInfo));
  }

  /**
   * Returns the merges necessary to optimize the index down to maxNumSegments.
   * Multiple merges may be returned so the {@link MergeScheduler} in use can
   * exploit concurrency.
   */
  @Override
  public MergeSpecification findForcedMerges(SegmentInfos infos, int maxNumSegments, Map<SegmentInfo, Boolean> segmentsToOptimize) throws IOException
  {
    assert maxNumSegments > 0;

    MergeSpecification spec = null;

    if (!isMerged(infos, maxNumSegments, segmentsToOptimize))
    {
      // Find the newest (rightmost) segment that needs to
      // be optimized (other segments may have been flushed
      // since optimize started):
      int last = infos.size();
      while(last > 0)
      {
        final SegmentInfo info = infos.info(--last);
        if (segmentsToOptimize.get(info))
        {
          last++;
          break;
        }
      }

      if (last > 0)
      {
        if (maxNumSegments == 1)
        {
          // Since we must optimize down to 1 segment, the
          // choice is simple:
         // boolean useCompoundFile = getUseCompoundFile();
          if (last > 1 || !isMerged(infos.info(0)))
          {
            spec = new MergeSpecification();
            spec.add(new OneMerge(infos.asList().subList(0, last)));
          }
        }
        else if (last > maxNumSegments)
        {    
          // find most balanced merges
          spec = findBalancedMerges(infos, last, maxNumSegments, _partialExpunge);
        }
      }
    }
    return spec;
  }

  private MergeSpecification findBalancedMerges(SegmentInfos infos, int infoLen, int maxNumSegments, boolean partialExpunge)
  throws IOException
  {
    if (infoLen <= maxNumSegments) return null;

    MergeSpecification spec = new MergeSpecification();
   // boolean useCompoundFile = getUseCompoundFile();

    // use Viterbi algorithm to find the best segmentation.
    // we will try to minimize the size variance of resulting segments.
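    //
    // DP formulation: variance[s][m] is the cost of building one output
    // segment from the (m + 1) input segments starting at index s. State
    // (i, j) means output segments 0..i are formed and (i + 1 + j) input
    // segments are consumed; sumVariance[] is updated in place row by row,
    // and backLink[i][j] records the split point chosen, so the winning
    // sequence of merges can be traced back afterwards.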

    double[][] variance = createVarianceTable(infos, infoLen, maxNumSegments);

    final int maxMergeSegments = infoLen - maxNumSegments + 1;
    double[] sumVariance = new double[maxMergeSegments];
    int[][] backLink = new int[maxNumSegments][maxMergeSegments];

    for(int i = (maxMergeSegments - 1); i >= 0; i--)
    {
      sumVariance[i] = variance[0][i];
      backLink[0][i] = 0;
    }
    for(int i = 1; i < maxNumSegments; i++)
    {
      for(int j = (maxMergeSegments - 1); j >= 0; j--)
      {
        double minV = Double.MAX_VALUE;
        int minK = 0;
        for(int k = j; k >= 0; k--)
        {
          double v = sumVariance[k] + variance[i + k][j - k];
          if(v < minV)
          {
            minV = v;
            minK = k;
          }
        }
        sumVariance[j] = minV;
        backLink[i][j] = minK;
      }
    }

    // now, trace back the back links to find all merges,
    // also find a candidate for partial expunge if requested
    int mergeEnd = infoLen;
    int prev = maxMergeSegments - 1;
    int expungeCandidate = -1;
    int maxDelCount = 0;
    for(int i = maxNumSegments - 1; i >= 0; i--)
    {
      prev = backLink[i][prev];
      int mergeStart = i + prev;
      if((mergeEnd - mergeStart) > 1)
      {
        spec.add(new OneMerge(infos.asList().subList(mergeStart, mergeEnd)));
      }
      else
      {
        if(partialExpunge)
        {
          SegmentInfo info = infos.info(mergeStart);
          int delCount = info.getDelCount();
          if(delCount > maxDelCount)
          {
            expungeCandidate = mergeStart;
            maxDelCount = delCount;
          }
        }
      }
      mergeEnd = mergeStart;
    }

    if(partialExpunge && maxDelCount > 0)
    {
      // expunge deletes
      spec.add(new OneMerge(infos.asList().subList(expungeCandidate, expungeCandidate + 1)));
    }

    return spec;
  }

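  /**
   * Builds the cost table for the balanced-merge DP: optSize is the total size
   * divided by maxNumSegments, and variance[i][j] is the squared relative
   * deviation from optSize of one merged segment built from the (j + 1)
   * segments starting at index i, i.e. (size / optSize - 1)^2. Entries that
   * would run past the end of the index are marked NaN, which is never
   * selected since NaN comparisons are always false.
   */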
  private double[][] createVarianceTable(SegmentInfos infos, int last, int maxNumSegments) throws IOException
  {
    int maxMergeSegments = last - maxNumSegments + 1;
    double[][] variance = new double[last][maxMergeSegments];

    // compute the optimal segment size
    long optSize = 0;
    long[] sizeArr = new long[last];
    for(int i = 0; i < sizeArr.length; i++)
    {
      sizeArr[i] = size(infos.info(i));
      optSize += sizeArr[i];
    }
    optSize = (optSize / maxNumSegments);

    for(int i = 0; i < last; i++)
    {
      long size = 0;
      for(int j = 0; j < maxMergeSegments; j++)
      {
        if((i + j) < last)
        {
          size += sizeArr[i + j];
          double residual = ((double)size/(double)optSize) - 1.0d;
          variance[i][j] = residual * residual;
        }
        else
        {
          variance[i][j] = Double.NaN;
        }
      }
    }
    return variance;
  }

  /**
   * Finds merges necessary to expunge all deletes from the
   * index. The number of large segments will stay the same.
   */ 
  @Override
  public MergeSpecification findForcedDeletesMerges(SegmentInfos infos)
  throws CorruptIndexException, IOException
  {
    final int numSegs = infos.size();
    final int numLargeSegs = (numSegs < _numLargeSegments ? numSegs : _numLargeSegments);
    MergeSpecification spec = null;

    if(numLargeSegs < numSegs)
    {
      List<SegmentInfo> smallSegmentList = infos.asList().subList(numLargeSegs, numSegs);
      SegmentInfos smallSegments = new SegmentInfos();
      smallSegments.addAll(smallSegmentList);
      spec = super.findForcedDeletesMerges(smallSegments);
    }

    if(spec == null) spec = new MergeSpecification();
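    // add a singleton merge for every large segment carrying deletes; merging
    // a segment with itself rewrites it without the deleted documents, so the
    // large-segment count is preserved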
    for(int i = 0; i < numLargeSegs; i++)
    {
      SegmentInfo info = infos.info(i);
      if(info.hasDeletions())
      {
        spec.add(new OneMerge(infos.asList().subList(i, i + 1)));        
      }
    }
    return spec;
  }

  /** Checks if any merges are now necessary and returns a
   *  {@link MergePolicy.MergeSpecification} if so.
   *  This merge policy tries to maintain the number of large segments
   *  configured by {@link #setNumLargeSegments} at similar sizes, and applies
   *  {@link LogByteSizeMergePolicy} to the small segments behind them.
   *  Small segments are merged and promoted to a large segment
   *  when their total size reaches the average size of the large segments.
   */
  @Override
  public MergeSpecification findMerges(SegmentInfos infos) throws IOException
  {
    final int numSegs = infos.size();
    final int numLargeSegs = _numLargeSegments;

    if(numSegs <= numLargeSegs) return null;

    long totalLargeSegSize = 0;
    long totalSmallSegSize = 0;
    SegmentInfo info;

    // compute the total size of large segments
    for(int i = 0; i < numLargeSegs; i++)
    {
      info = infos.info(i);
      totalLargeSegSize += size(info);
    }
    // compute the total size of small segments
    for(int i = numLargeSegs; i < numSegs; i++)
    {
      info = infos.info(i);
      totalSmallSegSize += size(info);
    }

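    // the promotion threshold: the average size a large segment would have if
    // the current large tier were spread over (numLargeSegs - 1) slots, leaving
    // one slot for the promoted small-segment merge. E.g. with five large
    // segments totalling 4GB, small segments are promoted once they together
    // reach 1GB.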
    long targetSegSize = (totalLargeSegSize / (numLargeSegs - 1));
    if(targetSegSize <= totalSmallSegSize)
    {
      // the total size of small segments is big enough,
      // promote the small segments to a large segment and do balanced merge,

      if(totalSmallSegSize < targetSegSize * 2)
      {
        MergeSpecification spec = findBalancedMerges(infos, numLargeSegs, (numLargeSegs - 1), _partialExpunge);
        if(spec == null) spec = new MergeSpecification(); // should not happen
        spec.add(new OneMerge(infos.asList().subList(numLargeSegs, numSegs)));
        return spec;
      }
      else
      {
        return findBalancedMerges(infos, numSegs, numLargeSegs, _partialExpunge);
      }      
    }
    else if(_maxSegments < numSegs)
    {
      // we have more than _maxSegments; merge the tail of small segments,
      // starting at the first one smaller than targetSegSize/4
      MergeSpecification spec = new MergeSpecification();
      int startSeg = numLargeSegs;
      long sizeThreshold = (targetSegSize / 4);
      while(startSeg < numSegs)
      {
        info = infos.info(startSeg);
        if(size(info) < sizeThreshold) break;
        startSeg++;
      }
      spec.add(new OneMerge(infos.asList().subList(startSeg, numSegs)));
      return spec;
    }
    else
    {
      // apply the log merge policy to small segments.
      List<SegmentInfo> smallSegmentList = infos.asList().subList(numLargeSegs, numSegs);
      SegmentInfos smallSegments = new SegmentInfos();
      smallSegments.addAll(smallSegmentList);
      MergeSpecification spec = super.findMerges(smallSegments);

      if(_partialExpunge)
      {
        OneMerge expunge  = findOneSegmentToExpunge(infos, numLargeSegs);
        if(expunge != null)
        {
          if(spec == null) spec = new MergeSpecification();
          spec.add(expunge);
        }
      }
      return spec;
    }      
  }

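  /**
   * Picks, among the first maxNumSegments segments, the one with the most
   * deleted documents and returns a singleton merge for it, so re-merging the
   * segment with itself expunges its deletes. Returns null if no segment has
   * deletions.
   */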
  private OneMerge findOneSegmentToExpunge(SegmentInfos infos, int maxNumSegments) throws IOException
  {
    int expungeCandidate = -1;
    int maxDelCount = 0;

    for(int i = maxNumSegments - 1; i >= 0; i--)
    {
      SegmentInfo info = infos.info(i);
      int delCount = info.getDelCount();
      if(delCount > maxDelCount)
      {
        expungeCandidate = i;
        maxDelCount = delCount;
      }
    }
    if(maxDelCount > 0)
    {
      return new OneMerge(infos.asList().subList(expungeCandidate, expungeCandidate + 1));
    }
    return null;
  }


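  /**
   * Mutable, synchronized holder for the tuning knobs of {@link ZoieMergePolicy}.
   * An instance can be built once and pushed to the policy via
   * {@link ZoieMergePolicy#setMergePolicyParams(MergePolicyParams)}; every
   * setter applies the same bounds checks as the policy itself and logs the
   * full parameter set after the change.
   */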
  public static class MergePolicyParams
  {
    public static final Logger log = Logger.getLogger(ZoieMergePolicy.MergePolicyParams.class.getName());
    private int _numLargeSegments;
    private int _maxSmallSegments;
    private boolean _doPartialExpunge;
    private int _mergeFactor;
    private boolean _useCompoundFile;
    private int _maxMergeDocs;

    public MergePolicyParams()
    {
      _useCompoundFile = false;
      _doPartialExpunge = false;
      _numLargeSegments = DEFAULT_NUM_LARGE_SEGMENTS;
      _maxSmallSegments = DEFAULT_NUM_SMALL_SEGMENTS;
      _mergeFactor = DEFAULT_MERGE_FACTOR;
      _maxMergeDocs = LogMergePolicy.DEFAULT_MAX_MERGE_DOCS;
    }
    @Override
    public String toString()
    {
      StringBuilder sb = new StringBuilder();
      sb.append("useCompoundFile: ").append(_useCompoundFile);
      sb.append(", doPartialExpunge: ").append(_doPartialExpunge);
      sb.append(", numLargeSegments: ").append(_numLargeSegments);
      sb.append(", maxSmallSegments: ").append(_maxSmallSegments);
      sb.append(", mergeFactor: ").append(_mergeFactor);
      sb.append(", maxMergeDocs: ").append(_maxMergeDocs);
      return sb.toString();
    }

    public synchronized void setNumLargeSegments(int numLargeSegments)
    {
      if (numLargeSegments < 2)
      {
        log.warn("numLargeSegments cannot be less than 2, while " + numLargeSegments + " is requested. Override with 2.");
        numLargeSegments = 2;
      }
      _numLargeSegments = numLargeSegments;
      log.info(this.toString());
    }

    public synchronized int getNumLargeSegments()
    {
      return _numLargeSegments;
    }

    public synchronized void setMaxSmallSegments(int maxSmallSegments)
    {
      if (maxSmallSegments < getMergeFactor()+1)
      {
        log.warn("MergeFactor is " +getMergeFactor() + ". maxSmallSegments is requested to be: "
            + maxSmallSegments + ". Override with mergeFactor + 1, since maxSmallSegments has to be greater than mergeFactor.");
        maxSmallSegments = getMergeFactor() + 1;
      }
      _maxSmallSegments = maxSmallSegments;
      log.info(this.toString());
    }

    public synchronized int getMaxSmallSegments()
    {
      return _maxSmallSegments;
    }

    public synchronized void setPartialExpunge(boolean doPartialExpunge)
    {
      _doPartialExpunge = doPartialExpunge;
      log.info(this.toString());
    }

    public synchronized boolean getPartialExpunge()
    {
      return _doPartialExpunge;
    }

    public synchronized void setMergeFactor(int mergeFactor)
    {
      if (mergeFactor<2)
      {
        log.warn("mergeFactor has to be at least 2. Override " + mergeFactor + " with 2");
        mergeFactor = 2;
      }
      _mergeFactor = mergeFactor;
      if(_maxSmallSegments <= getMergeFactor())
      {
        log.warn("maxSmallSegments has to be greater than mergeFactor. Override maxSmallSegments to: " + (mergeFactor + 1));
        _maxSmallSegments = getMergeFactor() + 1;
      }
      log.info(this.toString());
    }

    public synchronized int getMergeFactor()
    {
      return _mergeFactor;
    }

    public synchronized void setMaxMergeDocs(int maxMergeDocs)
    {
      _maxMergeDocs = maxMergeDocs;
      log.info(this.toString());
    }

    public synchronized int getMaxMergeDocs()
    {
      return _maxMergeDocs;
    }

    public synchronized void setUseCompoundFile(boolean useCompoundFile)
    {
      _useCompoundFile = useCompoundFile;
      log.info(this.toString());
    }

    public synchronized boolean isUseCompoundFile()
    {
      return _useCompoundFile;
    }
  }
}



