All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.solr.handler.extraction.ExtractingRequestHandler Maven / Gradle / Ivy

Go to download

The Apache Solr Content Extraction Library integrates the Apache Tika content extraction framework into Solr.

There is a newer version: 8.11.3
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.extraction;


import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.util.DateUtil;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.update.processor.UpdateRequestProcessor;
import org.apache.solr.util.plugin.SolrCoreAware;
import org.apache.solr.handler.ContentStreamHandlerBase;
import org.apache.solr.handler.loader.ContentStreamLoader;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.mime.MimeTypeException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;


/**
 * Handler for rich documents like PDF or Word or any other file format that Tika handles that need the text to be extracted
 * first from the document.
 * 

*/ public class ExtractingRequestHandler extends ContentStreamHandlerBase implements SolrCoreAware { private transient static Logger log = LoggerFactory.getLogger(ExtractingRequestHandler.class); public static final String CONFIG_LOCATION = "tika.config"; public static final String DATE_FORMATS = "date.formats"; protected TikaConfig config; protected Collection dateFormats = DateUtil.DEFAULT_DATE_FORMATS; protected SolrContentHandlerFactory factory; @Override public void init(NamedList args) { super.init(args); } @Override public void inform(SolrCore core) { if (initArgs != null) { //if relative,then relative to config dir, otherwise, absolute path String tikaConfigLoc = (String) initArgs.get(CONFIG_LOCATION); if (tikaConfigLoc != null) { File configFile = new File(tikaConfigLoc); if (configFile.isAbsolute() == false) { configFile = new File(core.getResourceLoader().getConfigDir(), configFile.getPath()); } try { config = new TikaConfig(configFile); } catch (Exception e) { throw new SolrException(ErrorCode.SERVER_ERROR, e); } } NamedList configDateFormats = (NamedList) initArgs.get(DATE_FORMATS); if (configDateFormats != null && configDateFormats.size() > 0) { dateFormats = new HashSet(); Iterator it = configDateFormats.iterator(); while (it.hasNext()) { String format = (String) it.next().getValue(); log.info("Adding Date Format: " + format); dateFormats.add(format); } } } if (config == null) { try { config = getDefaultConfig(core.getResourceLoader().getClassLoader()); } catch (MimeTypeException e) { throw new SolrException(ErrorCode.SERVER_ERROR, e); } catch (IOException e) { throw new SolrException(ErrorCode.SERVER_ERROR, e); } } factory = createFactory(); } private TikaConfig getDefaultConfig(ClassLoader classLoader) throws MimeTypeException, IOException { return new TikaConfig(classLoader); } protected SolrContentHandlerFactory createFactory() { return new SolrContentHandlerFactory(dateFormats); } @Override protected ContentStreamLoader 
newLoader(SolrQueryRequest req, UpdateRequestProcessor processor) { return new ExtractingDocumentLoader(req, processor, config, factory); } // ////////////////////// SolrInfoMBeans methods ////////////////////// @Override public String getDescription() { return "Add/Update Rich document"; } @Override public String getSource() { return "$URL: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene_solr_4_5/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java $"; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy