![JAR search and dependency download from the Maven repository](/logo.png)
org.apache.solr.handler.extraction.ExtractingParams Maven / Gradle / Ivy
Show all versions of solr-extraction Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.extraction;
/**
 * Names of the Solr request parameters that are used when extracting content.
 *
 * <p>Interface fields are implicitly {@code public static final}, so those modifiers are not
 * repeated on the individual constants.
 */
public interface ExtractingParams {

  /** Maps every generated attribute name to a field name using lowercase and underscores. */
  String LOWERNAMES = "lowernames";

  /** If true, a TikaException is ignored (text extraction is given up, but metadata is indexed). */
  String IGNORE_TIKA_EXCEPTION = "ignoreTikaException";

  /**
   * Parameter prefix used to map Tika metadata onto Solr fields.
   *
   * <p>A field is mapped by adding a parameter such as:
   *
   * <pre>fmap.title=solr.title</pre>
   *
   * With that parameter, the Tika "title" metadata value is added to the Solr field named
   * "solr.title".
   */
  String MAP_PREFIX = "fmap.";

  /**
   * Parameter prefix for literal values that should be added to the document, for example:
   *
   * <pre>literal.myField=Foo</pre>
   */
  String LITERALS_PREFIX = "literal.";

  /**
   * An XPath expression that restricts which extracted parts of a document are indexed. All
   * content satisfying the expression is passed to the {@link SolrContentHandler}.
   *
   * <p>See Tika's docs for what the extracted document looks like.
   *
   * @see #CAPTURE_ELEMENTS
   */
  String XPATH_EXPRESSION = "xpath";

  /** Extract and return the content only; do not index it. */
  String EXTRACT_ONLY = "extractOnly";

  /** Output format of the content when extractOnly is true: "xml" (the default) or "text". */
  String EXTRACT_FORMAT = "extractFormat";

  /**
   * Captures attributes separately, according to the name of the element, rather than simply
   * appending them to the string buffer.
   */
  String CAPTURE_ATTRIBUTES = "captureAttr";

  /**
   * By default, literal field values override other values such as metadata and content. Setting
   * this to false reverts to the pre-4.0 behaviour.
   */
  String LITERALS_OVERRIDE = "literalsOverride";

  /**
   * Captures the specified fields separately from the default, together with everything nested
   * below them that is not captured by some other capture field. This is different from passing
   * in an XPath expression.
   *
   * <p>The capture field is based on the localName returned to the {@link SolrContentHandler} by
   * Tika, not to be confused with the mapped field. The field name can then be mapped into the
   * index schema.
   *
   * <p>For instance, a Tika document may look like:
   *
   * <pre>
   *  &lt;html&gt;
   *    ...
   *    &lt;body&gt;
   *      &lt;p&gt;some text here.  &lt;div&gt;more text&lt;/div&gt;&lt;/p&gt;
   *      Some more text
   *    &lt;/body&gt;
   * </pre>
   *
   * By passing in the p tag, all P tags can be captured separately from the rest of the text.
   * Thus, in the example, the capture of the P tag would be: "some text here. more text"
   */
  String CAPTURE_ELEMENTS = "capture";

  /** The type of the stream. When it is not specified, Tika uses mime type detection. */
  String STREAM_TYPE = "stream.type";

  /**
   * Optional. The file name. When specified, Tika can take it into account while guessing the
   * MIME type.
   */
  String RESOURCE_NAME = "resource.name";

  /**
   * Optional. The password for this resource. It is used instead of the rule based password
   * lookup mechanisms.
   */
  String RESOURCE_PASSWORD = "resource.password";

  /**
   * Optional. When specified, the prefix is prepended to all Metadata, which makes it possible to
   * set up a dynamic field that captures it automatically.
   */
  String UNKNOWN_FIELD_PREFIX = "uprefix";

  /**
   * Optional. When specified, this default field is used whenever the name of a potential field
   * cannot be determined.
   */
  String DEFAULT_FIELD = "defaultField";

  /**
   * Optional. When specified, the file is loaded as a source for password lookups for Tika
   * encrypted documents.
   *
   * <p>The file uses the Java properties format, with one key=value per line. The key is
   * evaluated as a regex against the file name, and the value is the password. The rules are
   * evaluated top-bottom, i.e. the first match is used. For a fallback password that is always
   * used, supply a {@code .*=<defaultmypassword>} rule at the end.
   */
  String PASSWORD_MAP_FILE = "passwordsFile";
}