org.apache.tika.parser.xml.ElementMetadataHandler Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.xml;
import java.util.Arrays;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.xml.sax.Attributes;
/**
* SAX event handler that maps the contents of an XML element into
* a metadata field.
*
* @since Apache Tika 0.10
*/
public class ElementMetadataHandler extends AbstractMetadataHandler {
/**
* Logger for this class
*/
private static final Log logger = LogFactory
.getLog(ElementMetadataHandler.class);
private static final String LOCAL_NAME_RDF_BAG = "Bag";
private static final String LOCAL_NAME_RDF_LI = "li";
private static final String URI_RDF = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
private final String uri;
private final String localName;
private final Metadata metadata;
private final String name;
private Property targetProperty;
private final boolean allowDuplicateValues;
private final boolean allowEmptyValues;
/**
* The buffer used to capture characters when inside a bag li element.
*/
private final StringBuilder bufferBagged = new StringBuilder();
/**
* The buffer used to capture characters inside standard elements.
*/
private final StringBuilder bufferBagless = new StringBuilder();
/**
* Whether or not the value was found in a standard element structure or inside a bag.
*/
private boolean isBagless = true;
private int matchLevel = 0;
private int parentMatchLevel = 0;
/**
* Constructor for string metadata keys.
*
* @param uri the uri of the namespace of the element
* @param localName the local name of the element
* @param metadata the Tika metadata object to populate
* @param name the Tika metadata field key
*/
public ElementMetadataHandler(
String uri, String localName, Metadata metadata, String name) {
super(metadata, name);
this.uri = uri;
this.localName = localName;
this.metadata = metadata;
this.name = name;
this.allowDuplicateValues = false;
this.allowEmptyValues = false;
if (logger.isTraceEnabled()) {
logger.trace("created simple handler for " + this.name);
}
}
/**
* Constructor for string metadata keys which allows change of behavior
* for duplicate and empty entry values.
*
* @param uri the uri of the namespace of the element
* @param localName the local name of the element
* @param metadata the Tika metadata object to populate
* @param name the Tika metadata field key
* @param allowDuplicateValues add duplicate values to the Tika metadata
* @param allowEmptyValues add empty values to the Tika metadata
*/
public ElementMetadataHandler(
String uri, String localName, Metadata metadata, String name, boolean allowDuplicateValues, boolean allowEmptyValues) {
super(metadata, name);
this.uri = uri;
this.localName = localName;
this.metadata = metadata;
this.name = name;
this.allowDuplicateValues = allowDuplicateValues;
this.allowEmptyValues = allowEmptyValues;
if (logger.isTraceEnabled()) {
logger.trace("created simple handler for " + this.name);
}
}
/**
* Constructor for Property metadata keys.
*
* @param uri the uri of the namespace of the element
* @param localName the local name of the element
* @param metadata the Tika metadata object to populate
* @param targetProperty the Tika metadata Property key
*/
public ElementMetadataHandler(
String uri, String localName, Metadata metadata, Property targetProperty) {
super(metadata, targetProperty);
this.uri = uri;
this.localName = localName;
this.metadata = metadata;
this.targetProperty = targetProperty;
this.name = targetProperty.getName();
this.allowDuplicateValues = false;
this.allowEmptyValues = false;
if (logger.isTraceEnabled()) {
logger.trace("created property handler for " + this.name);
}
}
/**
* Constructor for Property metadata keys which allows change of behavior
* for duplicate and empty entry values.
*
* @param uri the uri of the namespace of the element
* @param localName the local name of the element
* @param metadata the Tika metadata object to populate
* @param targetProperty the Tika metadata Property key
* @param allowDuplicateValues add duplicate values to the Tika metadata
* @param allowEmptyValues add empty values to the Tika metadata
*/
public ElementMetadataHandler(
String uri, String localName, Metadata metadata, Property targetProperty, boolean allowDuplicateValues, boolean allowEmptyValues) {
super(metadata, targetProperty);
this.uri = uri;
this.localName = localName;
this.metadata = metadata;
this.targetProperty = targetProperty;
this.name = targetProperty.getName();
this.allowDuplicateValues = allowDuplicateValues;
this.allowEmptyValues = allowEmptyValues;
if (logger.isTraceEnabled()) {
logger.trace("created property handler for " + this.name);
}
}
protected boolean isMatchingParentElement(String uri, String localName) {
return (uri.equals(this.uri) && localName.equals(this.localName));
}
protected boolean isMatchingElement(String uri, String localName) {
// match if we're inside the parent element or within some bag element
return (uri.equals(this.uri) && localName.equals(this.localName)) ||
(parentMatchLevel > 0 &&
((uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_BAG)) ||
(uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_LI))
)
);
}
@Override
public void startElement(
String uri, String localName, String name, Attributes attributes) {
if (isMatchingElement(uri, localName)) {
matchLevel++;
}
if (isMatchingParentElement(uri, localName)) {
parentMatchLevel++;
}
}
@Override
public void endElement(String uri, String localName, String name) {
if (isMatchingParentElement(uri, localName)) {
parentMatchLevel--;
}
if (isMatchingElement(uri, localName)) {
matchLevel--;
if (matchLevel == 2) {
// we're inside a bag li element, add the bagged buffer
addMetadata(bufferBagged.toString().trim());
bufferBagged.setLength(0);
isBagless = false;
}
if (matchLevel == 0 && isBagless) {
String valueBagless = bufferBagless.toString();
if (valueBagless.length() > 0 && !valueBagless.contains(LOCAL_NAME_RDF_BAG)) {
// we're in a standard element, add the bagless buffer
addMetadata(valueBagless.trim());
bufferBagless.setLength(0);
}
isBagless = true;
}
}
}
@Override
public void characters(char[] ch, int start, int length) {
// We need to append to both buffers since we don't if we're inside a bag until we're done
if (parentMatchLevel > 0 && matchLevel > 2) {
bufferBagged.append(ch, start, length);
}
if (parentMatchLevel > 0 && matchLevel > 0) {
bufferBagless.append(ch, start, length);
}
}
@Override
public void ignorableWhitespace(char[] ch, int start, int length) {
characters(ch, start, length);
}
@Override
protected void addMetadata(String value) {
if (logger.isTraceEnabled()) {
logger.trace("adding " + name + "=" + value);
}
if (targetProperty != null && targetProperty.isMultiValuePermitted()) {
if ((value != null && value.length() > 0) || allowEmptyValues) {
if (value == null || value.length() == 0 && allowEmptyValues) {
value = "";
}
String[] previous = metadata.getValues(name);
if (previous == null || !Arrays.asList(previous).contains(value) || allowDuplicateValues) {
metadata.add(targetProperty, value);
}
}
} else {
super.addMetadata(value);
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy