All Downloads are FREE. Search and download functionality uses the official Maven repository.

org.apache.mahout.text.SequenceFilesFromLuceneStorageDriver Maven / Gradle / Ivy

Go to download

Optional components of Mahout which generally support interaction with third party systems, formats, APIs, etc.

There is a newer version: 0.13.0
Show newest version
package org.apache.mahout.text;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.ToolRunner;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Version;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.commandline.DefaultOptionCreator;

/**
 * Driver class for the lucene2seq program. Converts text contents of stored fields of a lucene index into a Hadoop
 * SequenceFile. The key of the sequence file is the document ID and the value is the concatenated text of the specified
 * stored field(s).
 *
 * <p>Command line options registered by {@link #run(String[])}: input path(s) (comma separated), output path,
 * {@code --idField}, {@code --fields} (comma separated), and the optional {@code --query}, {@code --maxHits}
 * and execution method options.</p>
 */
public class SequenceFilesFromLuceneStorageDriver extends AbstractJob {

  static final String OPTION_ID_FIELD = "idField";
  static final String OPTION_FIELD = "fields";
  static final String OPTION_QUERY = "query";
  static final String OPTION_MAX_HITS = "maxHits";

  /** Query used when no {@code --query} option is given. */
  static final Query DEFAULT_QUERY = new MatchAllDocsQuery();
  /** Hit limit used when no {@code --maxHits} option is given. */
  static final int DEFAULT_MAX_HITS = Integer.MAX_VALUE;

  /** Separator between field names in the {@code --fields} option value. */
  static final String SEPARATOR_FIELDS = ",";
  /** Character that is removed from the query string before it is parsed. */
  static final String QUERY_DELIMITER = "'";
  private static final Pattern QUERY_DELIMITER_PATTERN = Pattern.compile(QUERY_DELIMITER);

  /** Value of the method option that selects the non-MapReduce implementation. */
  private static final String SEQUENTIAL_METHOD_VALUE = "sequential";

  /**
   * Command line entry point; delegates to {@link ToolRunner}.
   *
   * @param args command line arguments
   * @throws Exception if the job fails
   */
  public static void main(String[] args) throws Exception {
    ToolRunner.run(new SequenceFilesFromLuceneStorageDriver(), args);
  }

  /**
   * Parses the command line, builds the {@link LuceneStorageConfiguration} and runs either the sequential
   * or the MapReduce conversion.
   *
   * @param args command line arguments
   * @return {@code -1} if the arguments could not be parsed, {@code 0} otherwise
   * @throws IllegalArgumentException if the query cannot be parsed
   * @throws NumberFormatException    if the max hits value is not an integer
   * @throws Exception                if option parsing or the conversion fails
   */
  @Override
  public int run(String[] args) throws Exception {
    addOutputOption();
    addInputOption();
    addOption(OPTION_ID_FIELD, "id", "The field in the index containing the id", true);
    addOption(OPTION_FIELD, "f", "The stored field(s) in the index containing text", true);

    addOption(OPTION_QUERY, "q", "(Optional) Lucene query. Defaults to " + DEFAULT_QUERY.getClass().getSimpleName());
    addOption(OPTION_MAX_HITS, "n", "(Optional) Max hits. Defaults to " + DEFAULT_MAX_HITS);
    addOption(DefaultOptionCreator.methodOption().create());

    if (parseArguments(args) == null) {
      return -1;
    }

    Configuration configuration = getConf();

    // The input option may carry several index locations separated by commas.
    List<Path> indexPaths = new ArrayList<>();
    for (String path : getInputPath().toString().split(",")) {
      indexPaths.add(new Path(path));
    }

    Path sequenceFilesOutputPath = getOutputPath();

    String idField = getOption(OPTION_ID_FIELD);
    List<String> fields = Arrays.asList(getOption(OPTION_FIELD).split(SEPARATOR_FIELDS));

    LuceneStorageConfiguration lucene2SeqConf = newLucene2SeqConfiguration(configuration,
            indexPaths,
            sequenceFilesOutputPath,
            idField,
            fields);

    Query query = DEFAULT_QUERY;
    if (hasOption(OPTION_QUERY)) {
      // Unqualified query terms are matched against the first configured text field.
      query = parseQuery(getOption(OPTION_QUERY), fields.get(0));
    }
    lucene2SeqConf.setQuery(query);

    int maxHits = DEFAULT_MAX_HITS;
    if (hasOption(OPTION_MAX_HITS)) {
      maxHits = Integer.parseInt(getOption(OPTION_MAX_HITS));
    }
    lucene2SeqConf.setMaxHits(maxHits);

    if (hasOption(DefaultOptionCreator.METHOD_OPTION)
        && SEQUENTIAL_METHOD_VALUE.equals(getOption(DefaultOptionCreator.METHOD_OPTION))) {
      new SequenceFilesFromLuceneStorage().run(lucene2SeqConf);
    } else {
      new SequenceFilesFromLuceneStorageMRJob().run(lucene2SeqConf);
    }
    return 0;
  }

  /**
   * Creates the configuration object handed to the conversion job. Kept as an overridable factory method.
   *
   * @param configuration           Hadoop configuration
   * @param indexPaths              locations of the lucene indexes to read
   * @param sequenceFilesOutputPath location the sequence files are written to
   * @param idField                 name of the index field containing the document id
   * @param fields                  names of the stored index fields containing text
   * @return a new {@link LuceneStorageConfiguration} built from the given arguments
   */
  public LuceneStorageConfiguration newLucene2SeqConfiguration(Configuration configuration,
                                                               List<Path> indexPaths,
                                                               Path sequenceFilesOutputPath,
                                                               String idField,
                                                               List<String> fields) {
    return new LuceneStorageConfiguration(
            configuration,
            indexPaths,
            sequenceFilesOutputPath,
            idField,
            fields);
  }

  /**
   * Parses a query given on the command line with the classic Lucene query parser and a
   * {@link StandardAnalyzer}.
   *
   * @param rawQuery     query string as given on the command line; every {@link #QUERY_DELIMITER}
   *                     character is removed before parsing
   * @param defaultField field that query terms without an explicit {@code field:} prefix are matched against
   * @return the parsed query
   * @throws IllegalArgumentException if the query string is not valid query syntax
   */
  private static Query parseQuery(String rawQuery, String defaultField) {
    String queryString = QUERY_DELIMITER_PATTERN.matcher(rawQuery).replaceAll("");
    try {
      // The second constructor argument is the default field name, not the query itself.
      QueryParser queryParser = new QueryParser(Version.LUCENE_46, defaultField,
          new StandardAnalyzer(Version.LUCENE_46));
      return queryParser.parse(queryString);
    } catch (ParseException e) {
      throw new IllegalArgumentException(e.getMessage(), e);
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy