All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.sax.RecursiveParserWrapperHandler Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.sax;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.utils.ParserUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import java.util.LinkedList;
import java.util.List;

/**
 * This is the default implementation of {@link AbstractRecursiveParserWrapperHandler}.
 * See its documentation for more details.
 *
 * This caches the a metadata object for each embedded file and for the container file.
 * It places the extracted content in the metadata object, with this key: {@link AbstractRecursiveParserWrapperHandler#TIKA_CONTENT}
 * If memory is a concern, subclass AbstractRecursiveParserWrapperHandler to handle each
 * embedded document.
 * 

* NOTE: This handler must only be used with the {@link org.apache.tika.parser.RecursiveParserWrapper} *

*/ public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrapperHandler { protected final List metadataList = new LinkedList<>(); /** * Create a handler with no limit on the number of embedded resources */ public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory) { super(contentHandlerFactory); } /** * Create a handler that limits the number of embedded resources that will be * parsed * @param maxEmbeddedResources number of embedded resources that will be parsed */ public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory, int maxEmbeddedResources) { super(contentHandlerFactory, maxEmbeddedResources); } /** * This is called before parsing an embedded document * * @param contentHandler - local content handler to use on the embedded document * @param metadata metadata to use for the embedded document * @throws SAXException */ @Override public void startEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException { super.startEmbeddedDocument(contentHandler, metadata); } /** * This is called after parsing an embedded document. * @param contentHandler local contenthandler used on the embedded document * @param metadata metadata from the embedded document * @throws SAXException */ @Override public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException { super.endEmbeddedDocument(contentHandler, metadata); addContent(contentHandler, metadata); metadataList.add(ParserUtils.cloneMetadata(metadata)); } /** * * @param contentHandler content handler used on the main document * @param metadata metadata from the main document * @throws SAXException */ @Override public void endDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException { super.endDocument(contentHandler, metadata); addContent(contentHandler, metadata); metadataList.add(0, ParserUtils.cloneMetadata(metadata)); } /** * * @return a list of Metadata objects, one for the main document and one for each embedded document */ public List getMetadataList() { return metadataList; } void addContent(ContentHandler handler, Metadata metadata) { if (handler.getClass().equals(DefaultHandler.class)){ //no-op: we can't rely on just testing for //empty content because DefaultHandler's toString() //returns e.g. "org.xml.sax.helpers.DefaultHandler@6c8b1edd" } else { String content = handler.toString(); if (content != null && content.trim().length() > 0 ) { metadata.add(TIKA_CONTENT, content); metadata.add(TIKA_CONTENT_HANDLER, handler.getClass().getSimpleName()); } } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy