// org.apache.tika.parser.microsoft.ooxml.MetadataExtractor (Maven / Gradle / Ivy)
// Source listing from the aem-sdk-api artifact (The Adobe Experience Manager SDK), newest version.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.ooxml;
import org.apache.poi.ooxml.POIXMLProperties;
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.opc.internal.PackagePropertiesPart;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.MSOffice;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.PagedText;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.microsoft.SummaryExtractor;
import org.apache.tika.parser.microsoft.ooxml.xps.XPSTextExtractor;
import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor;
import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
import org.apache.xmlbeans.impl.values.XmlValueOutOfRangeException;
import org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperty;
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
import java.math.BigDecimal;
import java.util.Date;
import java.util.Optional;
/**
 * OOXML metadata extractor.
 * <p>
 * Currently POI doesn't support metadata extraction for OOXML.
 * <p>
 * Reads the core, extended and custom property parts exposed by a
 * {@link POIXMLTextExtractor} and copies them into a Tika {@link Metadata}
 * object.
 *
 * @see OOXMLExtractor#getMetadataExtractor()
 */
public class MetadataExtractor {

    /** Text extractor whose document/package properties are read. */
    private final POIXMLTextExtractor extractor;

    /**
     * Creates a metadata extractor backed by the given POI text extractor.
     *
     * @param extractor the POI extractor to read properties from
     */
    public MetadataExtractor(POIXMLTextExtractor extractor) {
        this.extractor = extractor;
    }

    /**
     * Copies core, extended and custom properties into {@code metadata}.
     * <p>
     * Nothing is extracted unless the extractor exposes a document, or it is
     * one of the event-based / XPS extractors and exposes a package.
     *
     * @param metadata the metadata object to populate
     * @throws TikaException declared for callers; not thrown directly by this method
     */
    public void extract(Metadata metadata) throws TikaException {
        boolean packageBacked = extractor instanceof XSSFEventBasedExcelExtractor
                || extractor instanceof XWPFEventBasedWordExtractor
                || extractor instanceof XSLFEventBasedPowerPointExtractor
                || extractor instanceof XPSTextExtractor;

        if (extractor.getDocument() != null
                || (packageBacked && extractor.getPackage() != null)) {
            extractMetadata(extractor.getCoreProperties(), metadata);
            extractMetadata(extractor.getExtendedProperties(), metadata);
            extractMetadata(extractor.getCustomProperties(), metadata);
        }
    }

    /**
     * Copies the core (package) properties into {@code metadata}.
     * Properties that are absent are skipped.
     *
     * @param properties the core properties wrapper
     * @param metadata   the metadata object to populate
     */
    private void extractMetadata(POIXMLProperties.CoreProperties properties, Metadata metadata) {
        PackagePropertiesPart propsHolder = properties.getUnderlyingProperties();

        setProperty(metadata, OfficeOpenXMLCore.CATEGORY, propsHolder.getCategoryProperty());
        setProperty(metadata, OfficeOpenXMLCore.CONTENT_STATUS,
                propsHolder.getContentStatusProperty());
        setProperty(metadata, TikaCoreProperties.CREATED, propsHolder.getCreatedProperty());
        addMultiProperty(metadata, TikaCoreProperties.CREATOR, propsHolder.getCreatorProperty());
        setProperty(metadata, TikaCoreProperties.DESCRIPTION,
                propsHolder.getDescriptionProperty());
        setProperty(metadata, TikaCoreProperties.IDENTIFIER,
                propsHolder.getIdentifierProperty());
        addProperty(metadata, OfficeOpenXMLCore.SUBJECT, propsHolder.getSubjectProperty());
        addProperty(metadata, TikaCoreProperties.KEYWORDS, propsHolder.getKeywordsProperty());
        setProperty(metadata, TikaCoreProperties.LANGUAGE, propsHolder.getLanguageProperty());
        setProperty(metadata, TikaCoreProperties.MODIFIER,
                propsHolder.getLastModifiedByProperty());
        setProperty(metadata, TikaCoreProperties.PRINT_DATE,
                propsHolder.getLastPrintedProperty());
        setProperty(metadata, TikaCoreProperties.MODIFIED, propsHolder.getModifiedProperty());
        setProperty(metadata, OfficeOpenXMLCore.REVISION, propsHolder.getRevisionProperty());
        // TODO: Move to OO subject in Tika 2.0
        setProperty(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT,
                propsHolder.getSubjectProperty());
        setProperty(metadata, TikaCoreProperties.TITLE, propsHolder.getTitleProperty());
        setProperty(metadata, OfficeOpenXMLCore.VERSION, propsHolder.getVersionProperty());

        // Legacy Tika-1.0 style stats
        // TODO Remove these in Tika 2.0
        setProperty(metadata, Metadata.CATEGORY, propsHolder.getCategoryProperty());
        setProperty(metadata, Metadata.CONTENT_STATUS, propsHolder.getContentStatusProperty());
        setProperty(metadata, Metadata.REVISION_NUMBER, propsHolder.getRevisionProperty());
        setProperty(metadata, Metadata.VERSION, propsHolder.getVersionProperty());
    }

    /**
     * Copies the extended (application) properties and document statistics
     * into {@code metadata}, under both the current and the legacy
     * Tika-1.0 style keys.
     * <p>
     * String values are skipped when {@code null}; integer values are skipped
     * unless they are greater than zero.
     *
     * @param properties the extended properties wrapper
     * @param metadata   the metadata object to populate
     */
    private void extractMetadata(POIXMLProperties.ExtendedProperties properties,
                                 Metadata metadata) {
        CTProperties propsHolder = properties.getUnderlyingProperties();

        //TIKA-2055, some ooxml files can include unsigned int/long values
        //which cause this exception.
        //For now, catch it and record as '0' because
        //Word converts to '0' on save.
        int totalTime = 0;
        try {
            totalTime = propsHolder.getTotalTime();
        } catch (XmlValueOutOfRangeException ignored) {
            // Deliberately ignored: totalTime stays 0, which setProperty then skips.
        }

        // Each value is read once, at the point of its first use, and reused
        // for the legacy keys further down.
        String application = propsHolder.getApplication();
        setProperty(metadata, OfficeOpenXMLExtended.APPLICATION, application);
        String appVersion = propsHolder.getAppVersion();
        setProperty(metadata, OfficeOpenXMLExtended.APP_VERSION, appVersion);
        String company = propsHolder.getCompany();
        setProperty(metadata, TikaCoreProperties.PUBLISHER, company);
        setProperty(metadata, OfficeOpenXMLExtended.COMPANY, company);
        String manager = propsHolder.getManager();
        SummaryExtractor.addMulti(metadata, OfficeOpenXMLExtended.MANAGER, manager);
        String notes = propsHolder.getNotes();
        setProperty(metadata, OfficeOpenXMLExtended.NOTES, notes);
        String presentationFormat = propsHolder.getPresentationFormat();
        setProperty(metadata, OfficeOpenXMLExtended.PRESENTATION_FORMAT, presentationFormat);
        String template = propsHolder.getTemplate();
        setProperty(metadata, OfficeOpenXMLExtended.TEMPLATE, template);
        setProperty(metadata, OfficeOpenXMLExtended.TOTAL_TIME, totalTime);

        int docSecurityFlag = propsHolder.getDocSecurity();
        // A flag of 0 is not recorded numerically (setProperty skips values <= 0),
        // but its string form is still recorded below.
        setProperty(metadata, OfficeOpenXMLExtended.DOC_SECURITY, docSecurityFlag);
        setProperty(metadata, OfficeOpenXMLExtended.DOC_SECURITY_STRING,
                getDocSecurityString(docSecurityFlag));

        // Page count wins over slide count for the generic N_PAGES value.
        int pages = propsHolder.getPages();
        if (pages > 0) {
            metadata.set(PagedText.N_PAGES, pages);
        } else if (propsHolder.getSlides() > 0) {
            metadata.set(PagedText.N_PAGES, propsHolder.getSlides());
        }

        // Process the document statistics
        setProperty(metadata, Office.PAGE_COUNT, pages);
        int slides = propsHolder.getSlides();
        setProperty(metadata, Office.SLIDE_COUNT, slides);
        int paragraphs = propsHolder.getParagraphs();
        setProperty(metadata, Office.PARAGRAPH_COUNT, paragraphs);
        int lines = propsHolder.getLines();
        setProperty(metadata, Office.LINE_COUNT, lines);
        int words = propsHolder.getWords();
        setProperty(metadata, Office.WORD_COUNT, words);
        int characters = propsHolder.getCharacters();
        setProperty(metadata, Office.CHARACTER_COUNT, characters);
        int charactersWithSpaces = propsHolder.getCharactersWithSpaces();
        setProperty(metadata, Office.CHARACTER_COUNT_WITH_SPACES, charactersWithSpaces);

        // Legacy Tika-1.0 style stats
        // TODO Remove these in Tika 2.0
        setProperty(metadata, Metadata.APPLICATION_NAME, application);
        setProperty(metadata, Metadata.APPLICATION_VERSION, appVersion);
        setProperty(metadata, Metadata.MANAGER, manager);
        setProperty(metadata, Metadata.NOTES, notes);
        setProperty(metadata, Metadata.PRESENTATION_FORMAT, presentationFormat);
        setProperty(metadata, Metadata.TEMPLATE, template);
        setProperty(metadata, Metadata.TOTAL_TIME, totalTime);
        setProperty(metadata, MSOffice.PAGE_COUNT, pages);
        setProperty(metadata, MSOffice.SLIDE_COUNT, slides);
        setProperty(metadata, MSOffice.PARAGRAPH_COUNT, paragraphs);
        setProperty(metadata, MSOffice.LINE_COUNT, lines);
        setProperty(metadata, MSOffice.WORD_COUNT, words);
        setProperty(metadata, MSOffice.CHARACTER_COUNT, characters);
        setProperty(metadata, MSOffice.CHARACTER_COUNT_WITH_SPACES, charactersWithSpaces);
    }

    /**
     * Maps the numeric document-security flag to its descriptive constant.
     * Only the exact values 0, 1, 2, 4 and 8 are recognised; any other value
     * (including combinations of those values) maps to
     * {@link OfficeOpenXMLExtended#SECURITY_UNKNOWN}.
     *
     * @param docSecurityFlag the raw flag read from the extended properties
     * @return the matching {@code SECURITY_*} constant
     */
    private String getDocSecurityString(int docSecurityFlag) {
        //mappings from: https://exiftool.org/TagNames/OOXML.html and
        //https://docs.microsoft.com/en-us/dotnet/api/documentformat.openxml.extendedproperties.documentsecurity?view=openxml-2.8.1
        switch (docSecurityFlag) {
            case 0:
                return OfficeOpenXMLExtended.SECURITY_NONE;
            case 1:
                return OfficeOpenXMLExtended.SECURITY_PASSWORD_PROTECTED;
            case 2:
                return OfficeOpenXMLExtended.SECURITY_READ_ONLY_RECOMMENDED;
            case 4:
                return OfficeOpenXMLExtended.SECURITY_READ_ONLY_ENFORCED;
            case 8:
                return OfficeOpenXMLExtended.SECURITY_LOCKED_FOR_ANNOTATIONS;
            default:
                return OfficeOpenXMLExtended.SECURITY_UNKNOWN;
        }
    }

    /**
     * Copies the custom properties into {@code metadata} under keys of the
     * form {@code "custom:" + name}.
     * <p>
     * Date and filetime values are stored as external date properties; string,
     * boolean and numeric values are stored as strings. Properties of any
     * other type are skipped.
     *
     * @param properties the custom properties wrapper
     * @param metadata   the metadata object to populate
     */
    private void extractMetadata(POIXMLProperties.CustomProperties properties,
                                 Metadata metadata) {
        org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperties
                props = properties.getUnderlyingProperties();

        for (int i = 0; i < props.sizeOfPropertyArray(); i++) {
            CTProperty property = props.getPropertyArray(i);
            String val = null;
            Date date = null;

            // The first matching type wins; at most one of val/date is assigned.
            if (property.isSetLpwstr()) {
                val = property.getLpwstr();
            } else if (property.isSetLpstr()) {
                val = property.getLpstr();
            } else if (property.isSetDate()) {
                date = property.getDate().getTime();
            } else if (property.isSetFiletime()) {
                date = property.getFiletime().getTime();
            } else if (property.isSetBool()) {
                val = Boolean.toString(property.getBool());
            }
            // Integers
            else if (property.isSetI1()) {
                val = Integer.toString(property.getI1());
            } else if (property.isSetI2()) {
                val = Integer.toString(property.getI2());
            } else if (property.isSetI4()) {
                val = Integer.toString(property.getI4());
            } else if (property.isSetI8()) {
                val = Long.toString(property.getI8());
            } else if (property.isSetInt()) {
                val = Integer.toString(property.getInt());
            }
            // Unsigned Integers
            else if (property.isSetUi1()) {
                val = Integer.toString(property.getUi1());
            } else if (property.isSetUi2()) {
                val = Integer.toString(property.getUi2());
            } else if (property.isSetUi4()) {
                val = Long.toString(property.getUi4());
            } else if (property.isSetUi8()) {
                val = property.getUi8().toString();
            } else if (property.isSetUint()) {
                val = Long.toString(property.getUint());
            }
            // Reals
            else if (property.isSetR4()) {
                val = Float.toString(property.getR4());
            } else if (property.isSetR8()) {
                val = Double.toString(property.getR8());
            } else if (property.isSetDecimal()) {
                BigDecimal d = property.getDecimal();
                val = (d == null) ? null : d.toPlainString();
            }
            // Not supported yet, so the property is skipped:
            //  - array / vector:               TODO Fetch the values and output
            //  - blob / oblob:                 TODO Decode, if possible
            //  - stream / ostream / vstream:   TODO Decode, if possible
            //  - storage / ostorage:           TODO Decode, if possible
            //  - any other type

            String propName = "custom:" + property.getName();
            if (date != null) {
                Property tikaProp = Property.externalDate(propName);
                metadata.set(tikaProp, date);
            } else if (val != null) {
                metadata.set(propName, val);
            }
        }
    }

    /**
     * Sets {@code property} from an optional value, if present.
     * <p>
     * Only {@link Date}, {@link String}, {@link Integer} and {@link Double}
     * values are written; a present value of any other type is ignored.
     *
     * @param metadata      the metadata object to populate
     * @param property      the property to set
     * @param nullableValue the optional value
     * @param <T>           the type of the wrapped value
     */
    private <T> void setProperty(Metadata metadata, Property property,
                                 Optional<T> nullableValue) {
        if (!nullableValue.isPresent()) {
            return;
        }
        T value = nullableValue.get();
        if (value instanceof Date) {
            metadata.set(property, (Date) value);
        } else if (value instanceof String) {
            metadata.set(property, (String) value);
        } else if (value instanceof Integer) {
            metadata.set(property, (Integer) value);
        } else if (value instanceof Double) {
            metadata.set(property, (Double) value);
        }
    }

    /**
     * Adds an optional string value to {@code property}, if present.
     *
     * @param metadata      the metadata object to populate
     * @param property      the property to add to
     * @param nullableValue the optional value
     * @param <T>           the type of the wrapped value
     * @throws IllegalArgumentException if a value is present but is not a {@link String}
     */
    private <T> void addProperty(Metadata metadata, Property property,
                                 Optional<T> nullableValue) {
        if (!nullableValue.isPresent()) {
            return;
        }
        T value = nullableValue.get();
        if (value instanceof String) {
            metadata.add(property, (String) value);
        } else {
            // Report the class of the wrapped value; the Optional's own class
            // is always Optional and would say nothing about the offending type.
            throw new IllegalArgumentException(
                    "Can't add property of class: " + value.getClass());
        }
    }

    /**
     * Sets the named metadata entry from an optional string, if present.
     *
     * @param metadata      the metadata object to populate
     * @param property      the metadata key
     * @param nullableValue the optional value
     */
    private void setProperty(Metadata metadata, String property,
                             Optional<String> nullableValue) {
        if (!nullableValue.isPresent()) {
            return;
        }
        String value = nullableValue.get();
        metadata.set(property, value);
    }

    /**
     * Sets {@code property} to {@code value} unless the value is {@code null}.
     */
    private void setProperty(Metadata metadata, Property property, String value) {
        if (value != null) {
            metadata.set(property, value);
        }
    }

    /**
     * Sets the named metadata entry to {@code value} unless the value is {@code null}.
     */
    private void setProperty(Metadata metadata, String name, String value) {
        if (value != null) {
            metadata.set(name, value);
        }
    }

    /**
     * Sets {@code property} to {@code value} only when the value is greater than zero.
     */
    private void setProperty(Metadata metadata, Property property, int value) {
        if (value > 0) {
            metadata.set(property, value);
        }
    }

    /**
     * Sets the named metadata entry to the decimal string form of {@code value},
     * only when the value is greater than zero.
     */
    private void setProperty(Metadata metadata, String name, int value) {
        if (value > 0) {
            metadata.set(name, Integer.toString(value));
        }
    }

    /**
     * Passes an optional string to {@link SummaryExtractor#addMulti}, if present.
     *
     * @param metadata the metadata object to populate
     * @param property the property to add to
     * @param value    the optional value
     */
    private void addMultiProperty(Metadata metadata, Property property,
                                  Optional<String> value) {
        if (!value.isPresent()) {
            return;
        }
        SummaryExtractor.addMulti(metadata, property, value.get());
    }
}