/*
 * org.apache.mahout.text.LuceneStorageConfiguration Maven / Gradle / Ivy
 * Go to download
 * Show more of this group Show more artifacts with this name
 * Show all versions of mahout-integration Show documentation
 * Optional components of Mahout which generally support interaction with third-party systems,
 * formats, APIs, etc.
 */
package org.apache.mahout.text;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Objects;
import java.util.Set;

import com.google.common.base.Preconditions;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DefaultStringifier;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.DocumentStoredFieldVisitor;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;

import static org.apache.lucene.util.Version.LUCENE_46;
/**
* Holds all the configuration for {@link SequenceFilesFromLuceneStorage}, which generates a sequence file
* with id as the key and a content field as value.
*/
public class LuceneStorageConfiguration implements Writable {
private static final Query DEFAULT_QUERY = new MatchAllDocsQuery();
private static final int DEFAULT_MAX_HITS = Integer.MAX_VALUE;
static final String KEY = "org.apache.mahout.text.LuceneIndexToSequenceFiles";
static final String SEPARATOR_FIELDS = ",";
static final String SEPARATOR_PATHS = ",";
private Configuration configuration;
private List indexPaths;
private Path sequenceFilesOutputPath;
private String idField;
private List fields;
private Query query;
private int maxHits;
/**
* Create a configuration bean with all mandatory parameters.
*
* @param configuration Hadoop configuration for writing sequencefiles
* @param indexPaths paths to the index
* @param sequenceFilesOutputPath path to output the sequence file
* @param idField field used for the key of the sequence file
* @param fields field(s) used for the value of the sequence file
*/
public LuceneStorageConfiguration(Configuration configuration, List indexPaths, Path sequenceFilesOutputPath,
String idField, List fields) {
Preconditions.checkArgument(configuration != null, "Parameter 'configuration' cannot be null");
Preconditions.checkArgument(indexPaths != null, "Parameter 'indexPaths' cannot be null");
Preconditions.checkArgument(indexPaths != null && !indexPaths.isEmpty(), "Parameter 'indexPaths' cannot be empty");
Preconditions.checkArgument(sequenceFilesOutputPath != null, "Parameter 'sequenceFilesOutputPath' cannot be null");
Preconditions.checkArgument(idField != null, "Parameter 'idField' cannot be null");
Preconditions.checkArgument(fields != null, "Parameter 'fields' cannot be null");
Preconditions.checkArgument(fields != null && !fields.isEmpty(), "Parameter 'fields' cannot be empty");
this.configuration = configuration;
this.indexPaths = indexPaths;
this.sequenceFilesOutputPath = sequenceFilesOutputPath;
this.idField = idField;
this.fields = fields;
this.query = DEFAULT_QUERY;
this.maxHits = DEFAULT_MAX_HITS;
}
public LuceneStorageConfiguration() {
// Used during serialization. Do not use.
}
/**
* Deserializes a {@link LuceneStorageConfiguration} from a {@link Configuration}.
*
* @param conf the {@link Configuration} object with a serialized {@link LuceneStorageConfiguration}
* @throws IOException if deserialization fails
*/
public LuceneStorageConfiguration(Configuration conf) throws IOException {
Preconditions.checkNotNull(conf, "Parameter 'configuration' cannot be null");
String serializedConfigString = conf.get(KEY);
if (serializedConfigString == null) {
throw new IllegalArgumentException("Parameter 'configuration' does not contain a serialized " + this.getClass());
}
LuceneStorageConfiguration luceneStorageConf = DefaultStringifier.load(conf, KEY, LuceneStorageConfiguration.class);
this.configuration = conf;
this.indexPaths = luceneStorageConf.getIndexPaths();
this.sequenceFilesOutputPath = luceneStorageConf.getSequenceFilesOutputPath();
this.idField = luceneStorageConf.getIdField();
this.fields = luceneStorageConf.getFields();
this.query = luceneStorageConf.getQuery();
this.maxHits = luceneStorageConf.getMaxHits();
}
/**
* Serializes this object in a Hadoop {@link Configuration}
*
* @return a {@link Configuration} object with a String serialization
* @throws IOException if serialization fails
*/
public Configuration serialize() throws IOException {
DefaultStringifier.store(configuration, this, KEY);
return new Configuration(configuration);
}
/**
* Returns an {@link Iterator} which returns (Text, Text) {@link Pair}s of the produced sequence files.
*
* @return iterator
*/
public Iterator> getSequenceFileIterator() {
return new SequenceFileDirIterable(sequenceFilesOutputPath, PathType.LIST, PathFilters.logsCRCFilter(),
configuration).iterator();
}
public Configuration getConfiguration() {
return configuration;
}
public Path getSequenceFilesOutputPath() {
return sequenceFilesOutputPath;
}
public List getIndexPaths() {
return indexPaths;
}
public String getIdField() {
return idField;
}
public List getFields() {
return fields;
}
public void setQuery(Query query) {
this.query = query;
}
public Query getQuery() {
return query;
}
public void setMaxHits(int maxHits) {
this.maxHits = maxHits;
}
public int getMaxHits() {
return maxHits;
}
public DocumentStoredFieldVisitor getStoredFieldVisitor() {
Set fieldSet = new HashSet<>(Collections.singleton(idField));
fieldSet.addAll(fields);
return new DocumentStoredFieldVisitor(fieldSet);
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(sequenceFilesOutputPath.toString());
out.writeUTF(StringUtils.join(indexPaths, SEPARATOR_PATHS));
out.writeUTF(idField);
out.writeUTF(StringUtils.join(fields, SEPARATOR_FIELDS));
out.writeUTF(query.toString());
out.writeInt(maxHits);
}
@Override
public void readFields(DataInput in) throws IOException {
try {
sequenceFilesOutputPath = new Path(in.readUTF());
indexPaths = new ArrayList<>();
String[] indexPaths = in.readUTF().split(SEPARATOR_PATHS);
for (String indexPath : indexPaths) {
this.indexPaths.add(new Path(indexPath));
}
idField = in.readUTF();
fields = Arrays.asList(in.readUTF().split(SEPARATOR_FIELDS));
query = new QueryParser(LUCENE_46, "query", new StandardAnalyzer(LUCENE_46)).parse(in.readUTF());
maxHits = in.readInt();
} catch (ParseException e) {
throw new RuntimeException("Could not deserialize " + this.getClass().getName(), e);
}
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
LuceneStorageConfiguration that = (LuceneStorageConfiguration) o;
if (maxHits != that.maxHits) {
return false;
}
if (fields != null ? !fields.equals(that.fields) : that.fields != null) {
return false;
}
if (idField != null) {
if (!idField.equals(that.idField)) {
return false;
} else {
if (indexPaths != null) {
if (query != null) {
if (sequenceFilesOutputPath != null) {
return indexPaths.equals(that.indexPaths) && sequenceFilesOutputPath.equals(that.sequenceFilesOutputPath) && query.equals(that.query);
} else {
return indexPaths.equals(that.indexPaths) && that.sequenceFilesOutputPath == null && query.equals(that.query);
}
} else {
// query == null
if (that.query == null && indexPaths.equals(that.indexPaths)) {
if (sequenceFilesOutputPath != null) {
return sequenceFilesOutputPath.equals(that.sequenceFilesOutputPath);
} else {
return that.sequenceFilesOutputPath == null;
}
} else {
return false;
}
}
} else {
// indexPaths == null
if (that.indexPaths == null) {
if (query != null) {
if (sequenceFilesOutputPath != null) {
return sequenceFilesOutputPath.equals(that.sequenceFilesOutputPath) && query.equals(that.query);
} else {
return that.sequenceFilesOutputPath == null && query.equals(that.query);
}
} else {
if (that.query == null) {
if (sequenceFilesOutputPath != null) {
return sequenceFilesOutputPath.equals(that.sequenceFilesOutputPath);
} else {
return that.sequenceFilesOutputPath == null;
}
} else {
return false;
}
}
} else {
return false;
}
}
}
} else {
if (that.idField != null) {
return false;
} else {
if (indexPaths != null) {
if (query != null) {
if (sequenceFilesOutputPath != null) {
return !!indexPaths.equals(that.indexPaths) && !!query.equals(that.query) && !!sequenceFilesOutputPath.equals(that.sequenceFilesOutputPath);
} else {
return !!indexPaths.equals(that.indexPaths) && !!query.equals(that.query) && !(that.sequenceFilesOutputPath != null);
}
} else {
if (sequenceFilesOutputPath != null) {
return !!indexPaths.equals(that.indexPaths) && !(that.query != null) && !!sequenceFilesOutputPath.equals(that.sequenceFilesOutputPath);
} else {
return !!indexPaths.equals(that.indexPaths) && !(that.query != null) && !(that.sequenceFilesOutputPath != null);
}
}
} else {
if (query != null) {
if (sequenceFilesOutputPath != null) {
return that.indexPaths == null && query.equals(that.query) && sequenceFilesOutputPath.equals(that.sequenceFilesOutputPath);
} else {
return that.indexPaths == null && query.equals(that.query) && that.sequenceFilesOutputPath == null;
}
} else {
return that.indexPaths == null && that.query == null && (sequenceFilesOutputPath != null ? sequenceFilesOutputPath.equals(that.sequenceFilesOutputPath) : that.sequenceFilesOutputPath == null);
}
}
}
}
}
@Override
public int hashCode() {
int result = indexPaths != null ? indexPaths.hashCode() : 0;
result = 31 * result + (sequenceFilesOutputPath != null ? sequenceFilesOutputPath.hashCode() : 0);
result = 31 * result + (idField != null ? idField.hashCode() : 0);
result = 31 * result + (fields != null ? fields.hashCode() : 0);
result = 31 * result + (query != null ? query.hashCode() : 0);
result = 31 * result + maxHits;
return result;
}
}