All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.tika.parser.xml.ElementMetadataHandler Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.xml;

import java.util.Arrays;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;

/**
 * SAX event handler that maps the contents of an XML element into
 * a metadata field.
 * <p>
 * The value is taken either from the character data directly inside the
 * configured element, or from each <code>rdf:li</code> item of an
 * <code>rdf:Bag</code> nested inside it. Which of the two applies is only
 * known once the configured element has been closed, so both forms are
 * buffered while the element is open.
 *
 * @since Apache Tika 0.10
 */
public class ElementMetadataHandler extends AbstractMetadataHandler {
    private static final Logger LOG = LoggerFactory.getLogger(ElementMetadataHandler.class);

    private static final String LOCAL_NAME_RDF_BAG = "Bag";
    private static final String LOCAL_NAME_RDF_LI = "li";
    private static final String URI_RDF = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";

    private final String uri;

    private final String localName;

    private final Metadata metadata;

    private final String name;

    /**
     * The property to populate, or <code>null</code> when the handler was
     * created with a plain string key.
     */
    private final Property targetProperty;

    private final boolean allowDuplicateValues;
    private final boolean allowEmptyValues;

    /**
     * The buffer used to capture characters when inside a bag li element.
     */
    private final StringBuilder bufferBagged = new StringBuilder();

    /**
     * The buffer used to capture characters inside standard elements.
     */
    private final StringBuilder bufferBagless = new StringBuilder();

    /**
     * Whether or not the value was found in a standard element structure or inside a bag.
     */
    private boolean isBagless = true;

    /**
     * Number of currently open elements accepted by
     * {@link #isMatchingElement(String, String)}. With the usual
     * element / rdf:Bag / rdf:li nesting this is 1 inside the configured
     * element, 2 inside the bag and 3 inside a bag item.
     */
    private int matchLevel = 0;

    /**
     * Number of currently open elements accepted by
     * {@link #isMatchingParentElement(String, String)}.
     */
    private int parentMatchLevel = 0;

    /**
     * Constructor for string metadata keys. Duplicate and empty values
     * are not allowed.
     *
     * @param uri the uri of the namespace of the element
     * @param localName the local name of the element
     * @param metadata the Tika metadata object to populate
     * @param name the Tika metadata field key
     */
    public ElementMetadataHandler(
            String uri, String localName, Metadata metadata, String name) {
        this(uri, localName, metadata, name, false, false);
    }

    /**
     * Constructor for string metadata keys which allows change of behavior
     * for duplicate and empty entry values.
     *
     * @param uri the uri of the namespace of the element
     * @param localName the local name of the element
     * @param metadata the Tika metadata object to populate
     * @param name the Tika metadata field key
     * @param allowDuplicateValues add duplicate values to the Tika metadata
     * @param allowEmptyValues add empty values to the Tika metadata
     */
    public ElementMetadataHandler(
            String uri, String localName, Metadata metadata, String name,
            boolean allowDuplicateValues, boolean allowEmptyValues) {
        super(metadata, name);
        this.uri = uri;
        this.localName = localName;
        this.metadata = metadata;
        this.name = name;
        this.targetProperty = null;
        this.allowDuplicateValues = allowDuplicateValues;
        this.allowEmptyValues = allowEmptyValues;
        LOG.trace("created simple handler for {}", this.name);
    }

    /**
     * Constructor for Property metadata keys. Duplicate and empty values
     * are not allowed.
     *
     * @param uri the uri of the namespace of the element
     * @param localName the local name of the element
     * @param metadata the Tika metadata object to populate
     * @param targetProperty the Tika metadata Property key
     */
    public ElementMetadataHandler(
            String uri, String localName, Metadata metadata, Property targetProperty) {
        this(uri, localName, metadata, targetProperty, false, false);
    }

    /**
     * Constructor for Property metadata keys which allows change of behavior
     * for duplicate and empty entry values.
     *
     * @param uri the uri of the namespace of the element
     * @param localName the local name of the element
     * @param metadata the Tika metadata object to populate
     * @param targetProperty the Tika metadata Property key
     * @param allowDuplicateValues add duplicate values to the Tika metadata
     * @param allowEmptyValues add empty values to the Tika metadata
     */
    public ElementMetadataHandler(
            String uri, String localName, Metadata metadata, Property targetProperty,
            boolean allowDuplicateValues, boolean allowEmptyValues) {
        super(metadata, targetProperty);
        this.uri = uri;
        this.localName = localName;
        this.metadata = metadata;
        this.targetProperty = targetProperty;
        this.name = targetProperty.getName();
        this.allowDuplicateValues = allowDuplicateValues;
        this.allowEmptyValues = allowEmptyValues;
        LOG.trace("created property handler for {}", this.name);
    }

    /**
     * Checks whether the given element is the element this handler was
     * configured for.
     *
     * @param uri the namespace uri of the element
     * @param localName the local name of the element
     * @return <code>true</code> if both the namespace and the local name match
     */
    protected boolean isMatchingParentElement(String uri, String localName) {
        return uri.equals(this.uri) && localName.equals(this.localName);
    }

    /**
     * Checks whether the given element contributes to the value of this
     * handler: either the configured element itself, or an
     * <code>rdf:Bag</code> / <code>rdf:li</code> element seen while the
     * configured element is open.
     *
     * @param uri the namespace uri of the element
     * @param localName the local name of the element
     * @return <code>true</code> if the element should be tracked
     */
    protected boolean isMatchingElement(String uri, String localName) {
        if (uri.equals(this.uri) && localName.equals(this.localName)) {
            return true;
        }
        // bag elements only count while we're inside the parent element
        return parentMatchLevel > 0
                && uri.equals(URI_RDF)
                && (localName.equals(LOCAL_NAME_RDF_BAG) || localName.equals(LOCAL_NAME_RDF_LI));
    }

    /**
     * Tracks how deeply we are nested inside matching elements.
     */
    @Override
    public void startElement(
            String uri, String localName, String name, Attributes attributes) {
        // The element match must be evaluated before the parent level is
        // raised, since isMatchingElement depends on parentMatchLevel.
        if (isMatchingElement(uri, localName)) {
            matchLevel++;
        }
        if (isMatchingParentElement(uri, localName)) {
            parentMatchLevel++;
        }
    }

    /**
     * Emits the buffered value(s) when a bag item or the configured element
     * is closed, and resets the per-element state once the outermost
     * matching element has been left.
     */
    @Override
    public void endElement(String uri, String localName, String name) {
        if (isMatchingParentElement(uri, localName)) {
            parentMatchLevel--;
        }
        if (!isMatchingElement(uri, localName)) {
            return;
        }
        matchLevel--;
        if (matchLevel == 2) {
            // we're inside a bag li element, add the bagged buffer
            addMetadata(bufferBagged.toString().trim());
            bufferBagged.setLength(0);
            isBagless = false;
        }
        if (matchLevel == 0) {
            if (isBagless) {
                String valueBagless = bufferBagless.toString();
                // NOTE(review): the "Bag" test inspects the captured text, not
                // an element name, so any value containing "Bag" is skipped.
                // Kept as-is; confirm whether this is intended.
                if (valueBagless.length() > 0 && !valueBagless.contains(LOCAL_NAME_RDF_BAG)) {
                    // we're in a standard element, add the bagless buffer
                    addMetadata(valueBagless.trim());
                }
            }
            // Always start the next occurrence of the element from a clean
            // state. Previously the flag was never reset after a bag and the
            // buffer kept text that had not been added, so later bagless
            // values were dropped or prefixed with stale characters.
            bufferBagless.setLength(0);
            isBagless = true;
        }
    }

    /**
     * Captures character data while inside the configured element.
     */
    @Override
    public void characters(char[] ch, int start, int length) {
        if (parentMatchLevel <= 0) {
            return;
        }
        // We need to append to both buffers since we don't know if we're
        // inside a bag until we're done
        if (matchLevel > 2) {
            bufferBagged.append(ch, start, length);
        }
        if (matchLevel > 0) {
            bufferBagless.append(ch, start, length);
        }
    }

    /**
     * Treats ignorable whitespace like regular character data.
     */
    @Override
    public void ignorableWhitespace(char[] ch, int start, int length) {
        characters(ch, start, length);
    }

    /**
     * Adds the value to the metadata. For a multi-valued target property the
     * duplicate and empty value settings of this handler are applied; in all
     * other cases the value is passed on to the superclass.
     *
     * @param value the value to add, may be <code>null</code> or empty
     */
    @Override
    protected void addMetadata(String value) {
        LOG.trace("adding {}={}", name, value);
        if (targetProperty == null || !targetProperty.isMultiValuePermitted()) {
            super.addMetadata(value);
            return;
        }
        boolean empty = value == null || value.length() == 0;
        if (empty && !allowEmptyValues) {
            return;
        }
        // a null value is stored as the empty string
        String toAdd = empty ? "" : value;
        if (allowDuplicateValues) {
            metadata.add(targetProperty, toAdd);
            return;
        }
        String[] previous = metadata.getValues(name);
        if (previous == null || !Arrays.asList(previous).contains(toAdd)) {
            metadata.add(targetProperty, toAdd);
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy