org.apache.solr.search.HashDocSet Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of solr-core Show documentation
Apache Solr Server
The newest version!
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.search;

import org.apache.lucene.util.BitUtil;


/**
 * HashDocSet represents an unordered set of Lucene Document Ids
 * using a primitive int hash table.  It can be a better choice if there are few docs
 * in the set because it takes up less memory and is faster to iterate and take
 * set intersections.
 *
 * @version $Id: HashDocSet.java 790938 2009-07-03 15:09:21Z yonik $
 * @since solr 0.9
 */
public final class HashDocSet extends DocSetBase {
  /** Default load factor to use for HashDocSets.  We keep track of the inverse
   *  since multiplication is so much faster than division.  The default
   *  is 1.0f / 0.75f
   */
  static float DEFAULT_INVERSE_LOAD_FACTOR = 1.0f /0.75f;

  // public final static int MAX_SIZE = SolrConfig.config.getInt("//HashDocSet/@maxSize",-1);


  // lucene docs are numbered from 0, so a neg number must be used for missing.
  // an alternative to having to init the array to EMPTY at the start is
  //
  private final static int EMPTY=-1;
  private final int[] table;
  private final int size;

  private final int mask;

  /** Create a HashDocSet from a list of *unique* ids */
  public HashDocSet(int[] docs, int offset, int len) {
    this(docs, offset, len, DEFAULT_INVERSE_LOAD_FACTOR);
  }

  /** Create a HashDocSet from a list of *unique* ids */  
  public HashDocSet(int[] docs, int offset, int len, float inverseLoadFactor) {
    int tsize = Math.max(BitUtil.nextHighestPowerOfTwo(len), 1);
    if (tsize < len * inverseLoadFactor) {
      tsize <<= 1;
    }

    mask=tsize-1;

    table = new int[tsize];
    // (for now) better then: Arrays.fill(table, EMPTY);
    // https://issues.apache.org/jira/browse/SOLR-390
    for (int i=tsize-1; i>=0; i--) table[i]=EMPTY;

    int end = offset + len;
    for (int i=offset; i>7)|1) only once.
      // otherwise, we would need to pull the first case out of the loop.
      s = (s + ((doc>>7)|1)) & mask;
    }
    table[s]=doc;
  }

  public boolean exists(int doc) {
    int s = doc & mask;
    for(;;) {
      int v = table[s];
      if (v==EMPTY) return false;
      if (v==doc) return true;
      // see put() for algorithm details.
      s = (s + ((doc>>7)|1)) & mask;
    }
  }


  public int size() {
    return size;
  }

  public DocIterator iterator() {
    return new DocIterator() {
      int pos=0;
      int doc;
      { goNext(); }

      public boolean hasNext() {
        return pos < table.length;
      }

      public Integer next() {
        return nextDoc();
      }

      public void remove() {
      }

      void goNext() {
        while (pos= 0 && b.exists(id)) {
         result[resultCount++]=id;
       }
     }
     return new HashDocSet(result,0,resultCount);

   } else {

     int[] result = new int[size()];
     int resultCount=0;
     for (int i=0; i= 0 && other.exists(id)) {
         result[resultCount++]=id;
       }
     }
     return new HashDocSet(result,0,resultCount);
   }

  }

  @Override
  public int intersectionSize(DocSet other) {
   if (other instanceof HashDocSet) {
     // set "a" to the smallest doc set for the most efficient
     // intersection.
     final HashDocSet a = size()<=other.size() ? this : (HashDocSet)other;
     final HashDocSet b = size()<=other.size() ? (HashDocSet)other : this;

     int resultCount=0;
     for (int i=0; i= 0 && b.exists(id)) {
         resultCount++;
       }
     }
     return resultCount;
   } else {
     int resultCount=0;
     for (int i=0; i= 0 && other.exists(id)) {
         resultCount++;
       }
     }
     return resultCount;
   }

  }


  @Override
  public DocSet andNot(DocSet other) {
    int[] result = new int[size()];
    int resultCount=0;

    for (int i=0; i= 0 && !other.exists(id)) {
        result[resultCount++]=id;
      }
    }
    return new HashDocSet(result,0,resultCount);
  }

  @Override
  public DocSet union(DocSet other) {
   if (other instanceof HashDocSet) {
     // set "a" to the smallest doc set
     final HashDocSet a = size()<=other.size() ? this : (HashDocSet)other;
     final HashDocSet b = size()<=other.size() ? (HashDocSet)other : this;

     int[] result = new int[a.size()+b.size()];
     int resultCount=0;
     // iterate over the largest table first, adding w/o checking.
     for (int i=0; i=0) result[resultCount++]=id;
     }

     // now iterate over smaller set, adding all not already in larger set.
     for (int i=0; i=0 && !b.exists(id)) result[resultCount++]=id;
     }

     return new HashDocSet(result,0,resultCount);
   } else {
     return other.union(this);
   }
  }


  // don't implement andNotSize() and unionSize() on purpose... they are implemented
  // in BaseDocSet in terms of intersectionSize().
}