All Downloads are FREE. Search and download functionality uses the official Maven repository.

org.apache.mahout.text.LuceneStorageConfiguration Maven / Gradle / Ivy

Go to download

Optional components of Mahout which generally support interaction with third party systems, formats, APIs, etc.

There is a newer version: 0.13.0
Show newest version
package org.apache.mahout.text;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Objects;
import java.util.Set;

import com.google.common.base.Preconditions;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DefaultStringifier;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.DocumentStoredFieldVisitor;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;

import static org.apache.lucene.util.Version.LUCENE_46;

/**
 * Holds all the configuration for {@link SequenceFilesFromLuceneStorage}, which generates a sequence file
 * with id as the key and a content field as value.
 */
public class LuceneStorageConfiguration implements Writable {

  private static final Query DEFAULT_QUERY = new MatchAllDocsQuery();
  private static final int DEFAULT_MAX_HITS = Integer.MAX_VALUE;

  static final String KEY = "org.apache.mahout.text.LuceneIndexToSequenceFiles";

  static final String SEPARATOR_FIELDS = ",";
  static final String SEPARATOR_PATHS = ",";

  private Configuration configuration;
  private List indexPaths;
  private Path sequenceFilesOutputPath;
  private String idField;
  private List fields;
  private Query query;
  private int maxHits;

  /**
   * Create a configuration bean with all mandatory parameters.
   *
   * @param configuration           Hadoop configuration for writing sequencefiles
   * @param indexPaths              paths to the index
   * @param sequenceFilesOutputPath path to output the sequence file
   * @param idField                 field used for the key of the sequence file
   * @param fields                  field(s) used for the value of the sequence file
   */
  public LuceneStorageConfiguration(Configuration configuration, List indexPaths, Path sequenceFilesOutputPath,
                                    String idField, List fields) {
    Preconditions.checkArgument(configuration != null, "Parameter 'configuration' cannot be null");
    Preconditions.checkArgument(indexPaths != null, "Parameter 'indexPaths' cannot be null");
    Preconditions.checkArgument(indexPaths != null && !indexPaths.isEmpty(), "Parameter 'indexPaths' cannot be empty");
    Preconditions.checkArgument(sequenceFilesOutputPath != null, "Parameter 'sequenceFilesOutputPath' cannot be null");
    Preconditions.checkArgument(idField != null, "Parameter 'idField' cannot be null");
    Preconditions.checkArgument(fields != null, "Parameter 'fields' cannot be null");
    Preconditions.checkArgument(fields != null && !fields.isEmpty(), "Parameter 'fields' cannot be empty");

    this.configuration = configuration;
    this.indexPaths = indexPaths;
    this.sequenceFilesOutputPath = sequenceFilesOutputPath;
    this.idField = idField;
    this.fields = fields;

    this.query = DEFAULT_QUERY;
    this.maxHits = DEFAULT_MAX_HITS;
  }

  public LuceneStorageConfiguration() {
    // Used during serialization. Do not use.
  }

  /**
   * Deserializes a {@link LuceneStorageConfiguration} from a {@link Configuration}.
   *
   * @param conf the {@link Configuration} object with a serialized {@link LuceneStorageConfiguration}
   * @throws IOException if deserialization fails
   */
  public LuceneStorageConfiguration(Configuration conf) throws IOException {
    Preconditions.checkNotNull(conf, "Parameter 'configuration' cannot be null");

    String serializedConfigString = conf.get(KEY);

    if (serializedConfigString == null) {
      throw new IllegalArgumentException("Parameter 'configuration' does not contain a serialized " + this.getClass());
    }

    LuceneStorageConfiguration luceneStorageConf = DefaultStringifier.load(conf, KEY, LuceneStorageConfiguration.class);

    this.configuration = conf;
    this.indexPaths = luceneStorageConf.getIndexPaths();
    this.sequenceFilesOutputPath = luceneStorageConf.getSequenceFilesOutputPath();
    this.idField = luceneStorageConf.getIdField();
    this.fields = luceneStorageConf.getFields();
    this.query = luceneStorageConf.getQuery();
    this.maxHits = luceneStorageConf.getMaxHits();
  }

  /**
   * Serializes this object in a Hadoop {@link Configuration}
   *
   * @return a {@link Configuration} object with a String serialization
   * @throws IOException if serialization fails
   */
  public Configuration serialize() throws IOException {
    DefaultStringifier.store(configuration, this, KEY);

    return new Configuration(configuration);
  }

  /**
   * Returns an {@link Iterator} which returns (Text, Text) {@link Pair}s of the produced sequence files.
   *
   * @return iterator
   */
  public Iterator> getSequenceFileIterator() {
    return new SequenceFileDirIterable(sequenceFilesOutputPath, PathType.LIST, PathFilters.logsCRCFilter(),
                                                   configuration).iterator();
  }

  public Configuration getConfiguration() {
    return configuration;
  }

  public Path getSequenceFilesOutputPath() {
    return sequenceFilesOutputPath;
  }

  public List getIndexPaths() {
    return indexPaths;
  }

  public String getIdField() {
    return idField;
  }

  public List getFields() {
    return fields;
  }

  public void setQuery(Query query) {
    this.query = query;
  }

  public Query getQuery() {
    return query;
  }

  public void setMaxHits(int maxHits) {
    this.maxHits = maxHits;
  }

  public int getMaxHits() {
    return maxHits;
  }

  public DocumentStoredFieldVisitor getStoredFieldVisitor() {
    Set fieldSet = new HashSet<>(Collections.singleton(idField));
    fieldSet.addAll(fields);
    return new DocumentStoredFieldVisitor(fieldSet);
  }

  @Override
  public void write(DataOutput out) throws IOException {
    out.writeUTF(sequenceFilesOutputPath.toString());
    out.writeUTF(StringUtils.join(indexPaths, SEPARATOR_PATHS));
    out.writeUTF(idField);
    out.writeUTF(StringUtils.join(fields, SEPARATOR_FIELDS));
    out.writeUTF(query.toString());
    out.writeInt(maxHits);
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    try {
      sequenceFilesOutputPath = new Path(in.readUTF());
      indexPaths = new ArrayList<>();
      String[] indexPaths = in.readUTF().split(SEPARATOR_PATHS);
      for (String indexPath : indexPaths) {
        this.indexPaths.add(new Path(indexPath));
      }
      idField = in.readUTF();
      fields = Arrays.asList(in.readUTF().split(SEPARATOR_FIELDS));
      query = new QueryParser(LUCENE_46, "query", new StandardAnalyzer(LUCENE_46)).parse(in.readUTF());
      maxHits = in.readInt();
    } catch (ParseException e) {
      throw new RuntimeException("Could not deserialize " + this.getClass().getName(), e);
    }
  }

  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    if (o == null || getClass() != o.getClass()) {
      return false;
    }

    LuceneStorageConfiguration that = (LuceneStorageConfiguration) o;

    if (maxHits != that.maxHits) {
      return false;
    }
    if (fields != null ? !fields.equals(that.fields) : that.fields != null) {
      return false;
    }
    if (idField != null) {
      if (!idField.equals(that.idField)) {
        return false;
      } else {
        if (indexPaths != null) {
          if (query != null) {
            if (sequenceFilesOutputPath != null) {
              return indexPaths.equals(that.indexPaths) && sequenceFilesOutputPath.equals(that.sequenceFilesOutputPath) && query.equals(that.query);
            } else {
              return indexPaths.equals(that.indexPaths) && that.sequenceFilesOutputPath == null && query.equals(that.query);
            }
          } else {
            // query == null
            if (that.query == null && indexPaths.equals(that.indexPaths)) {
              if (sequenceFilesOutputPath != null) {
                return sequenceFilesOutputPath.equals(that.sequenceFilesOutputPath);
              } else {
                return that.sequenceFilesOutputPath == null;
              }
            } else {
              return false;
            }
          }
        } else {
          // indexPaths == null
          if (that.indexPaths == null) {
            if (query != null) {
              if (sequenceFilesOutputPath != null) {
                return sequenceFilesOutputPath.equals(that.sequenceFilesOutputPath) && query.equals(that.query);
              } else {
                return that.sequenceFilesOutputPath == null && query.equals(that.query);
              }
            } else {
              if (that.query == null) {
                if (sequenceFilesOutputPath != null) {
                  return sequenceFilesOutputPath.equals(that.sequenceFilesOutputPath);
                } else {
                  return that.sequenceFilesOutputPath == null;
                }
              } else {
                return false;
              }
            }
          } else {
            return false;
          }
        }
      }
    } else {
      if (that.idField != null) {
        return false;
      } else {
        if (indexPaths != null) {
          if (query != null) {
            if (sequenceFilesOutputPath != null) {
              return !!indexPaths.equals(that.indexPaths) && !!query.equals(that.query) && !!sequenceFilesOutputPath.equals(that.sequenceFilesOutputPath);
            } else {
              return !!indexPaths.equals(that.indexPaths) && !!query.equals(that.query) && !(that.sequenceFilesOutputPath != null);
            }
          } else {
            if (sequenceFilesOutputPath != null) {
              return !!indexPaths.equals(that.indexPaths) && !(that.query != null) && !!sequenceFilesOutputPath.equals(that.sequenceFilesOutputPath);
            } else {
              return !!indexPaths.equals(that.indexPaths) && !(that.query != null) && !(that.sequenceFilesOutputPath != null);
            }
          }
        } else {
          if (query != null) {
            if (sequenceFilesOutputPath != null) {
              return that.indexPaths == null && query.equals(that.query) && sequenceFilesOutputPath.equals(that.sequenceFilesOutputPath);
            } else {
              return that.indexPaths == null && query.equals(that.query) && that.sequenceFilesOutputPath == null;
            }
          } else {
            return that.indexPaths == null && that.query == null && (sequenceFilesOutputPath != null ? sequenceFilesOutputPath.equals(that.sequenceFilesOutputPath) : that.sequenceFilesOutputPath == null);
          }
        }
      }
    }

  }

  @Override
  public int hashCode() {
    int result = indexPaths != null ? indexPaths.hashCode() : 0;
    result = 31 * result + (sequenceFilesOutputPath != null ? sequenceFilesOutputPath.hashCode() : 0);
    result = 31 * result + (idField != null ? idField.hashCode() : 0);
    result = 31 * result + (fields != null ? fields.hashCode() : 0);
    result = 31 * result + (query != null ? query.hashCode() : 0);
    result = 31 * result + maxHits;
    return result;
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy