org.apache.solr.handler.extraction.ExtractingRequestHandler Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of solr-cell Show documentation
Show all versions of solr-cell Show documentation
The Apache Solr Content Extraction Library integrates the Apache Tika
content extraction framework into Solr.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.extraction;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.util.DateUtil;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.update.processor.UpdateRequestProcessor;
import org.apache.solr.util.plugin.SolrCoreAware;
import org.apache.solr.handler.ContentStreamHandlerBase;
import org.apache.solr.handler.loader.ContentStreamLoader;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.mime.MimeTypeException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
/**
* Handler for rich documents, such as PDF, Word, or any other file format that Tika handles, whose text must first be
* extracted from the document.
*
*/
public class ExtractingRequestHandler extends ContentStreamHandlerBase implements SolrCoreAware {

  private static final Logger log = LoggerFactory.getLogger(ExtractingRequestHandler.class);

  /** Name of the init argument holding the path of a Tika configuration file. */
  public static final String CONFIG_LOCATION = "tika.config";

  /** Name of the init argument holding the list of date formats. */
  public static final String DATE_FORMATS = "date.formats";

  /** Tika configuration; loaded from {@link #CONFIG_LOCATION} if given, otherwise the default one. */
  protected TikaConfig config;

  /** Date formats handed to the content handler factory; replaced when {@link #DATE_FORMATS} is configured. */
  protected Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;

  /** Factory built by {@link #createFactory()} at the end of {@link #inform(SolrCore)}. */
  protected SolrContentHandlerFactory factory;

  /**
   * Delegates initialization to the parent handler.
   *
   * @param args the handler's init arguments
   */
  @Override
  public void init(NamedList args) {
    // The parameter stays raw on purpose: it has to match the signature of the overridden method.
    super.init(args);
  }

  /**
   * Finishes configuration once the {@link SolrCore} is available: loads the Tika configuration,
   * reads the optional date formats from the init arguments, and creates the content handler factory.
   *
   * @param core the core whose resource loader is used to resolve the config file and class loader
   * @throws SolrException with {@link ErrorCode#SERVER_ERROR} if the Tika configuration cannot be loaded
   */
  @Override
  public void inform(SolrCore core) {
    if (initArgs != null) {
      // A relative location is resolved against the core's config dir; an absolute one is used as-is.
      String tikaConfigLoc = (String) initArgs.get(CONFIG_LOCATION);
      if (tikaConfigLoc != null) {
        File configFile = new File(tikaConfigLoc);
        if (!configFile.isAbsolute()) {
          configFile = new File(core.getResourceLoader().getConfigDir(), configFile.getPath());
        }
        try {
          config = new TikaConfig(configFile);
        } catch (Exception e) {
          throw new SolrException(ErrorCode.SERVER_ERROR, e);
        }
      }

      NamedList<?> configDateFormats = (NamedList<?>) initArgs.get(DATE_FORMATS);
      if (configDateFormats != null && configDateFormats.size() > 0) {
        // Configured formats replace the defaults rather than extend them.
        dateFormats = new HashSet<String>();
        for (Map.Entry<String, ?> entry : configDateFormats) {
          String format = (String) entry.getValue();
          log.info("Adding Date Format: {}", format);
          dateFormats.add(format);
        }
      }
    }

    // No explicit config file (or no init args at all): fall back to Tika's default configuration.
    if (config == null) {
      try {
        config = getDefaultConfig(core.getResourceLoader().getClassLoader());
      } catch (MimeTypeException e) {
        throw new SolrException(ErrorCode.SERVER_ERROR, e);
      } catch (IOException e) {
        throw new SolrException(ErrorCode.SERVER_ERROR, e);
      }
    }

    factory = createFactory();
  }

  /**
   * Builds a Tika configuration from the given class loader.
   *
   * @param classLoader class loader passed to the {@link TikaConfig} constructor
   * @return the new configuration
   * @throws MimeTypeException if thrown by the {@link TikaConfig} constructor
   * @throws IOException if thrown by the {@link TikaConfig} constructor
   */
  private TikaConfig getDefaultConfig(ClassLoader classLoader) throws MimeTypeException, IOException {
    return new TikaConfig(classLoader);
  }

  /**
   * Creates the content handler factory from the current {@link #dateFormats}.
   * Subclasses may override this to supply a different factory.
   *
   * @return a new {@link SolrContentHandlerFactory}
   */
  protected SolrContentHandlerFactory createFactory() {
    return new SolrContentHandlerFactory(dateFormats);
  }

  /**
   * Creates the loader for a request.
   *
   * @param req the current request
   * @param processor the update processor passed on to the loader
   * @return a new {@link ExtractingDocumentLoader} using this handler's Tika config and factory
   */
  @Override
  protected ContentStreamLoader newLoader(SolrQueryRequest req, UpdateRequestProcessor processor) {
    return new ExtractingDocumentLoader(req, processor, config, factory);
  }

  // ////////////////////// SolrInfoMBeans methods //////////////////////

  /** @return a short human-readable description of this handler */
  @Override
  public String getDescription() {
    return "Add/Update Rich document";
  }

  /** @return the version-control URL keyword string for this source file */
  @Override
  public String getSource() {
    return "$URL: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene_solr_4_5/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java $";
  }
}