All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.tika.parser.microsoft.ooxml.MetadataExtractor Maven / Gradle / Ivy

There is a newer version: 2024.11.18751.20241128T090041Z-241100
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft.ooxml;

import org.apache.poi.ooxml.POIXMLProperties;
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.opc.internal.PackagePropertiesPart;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.MSOffice;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.PagedText;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.microsoft.SummaryExtractor;
import org.apache.tika.parser.microsoft.ooxml.xps.XPSTextExtractor;
import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor;
import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
import org.apache.xmlbeans.impl.values.XmlValueOutOfRangeException;
import org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperty;
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;

import java.math.BigDecimal;
import java.util.Date;
import java.util.Optional;

/**
 * OOXML metadata extractor.
 * 

* Currently POI doesn't support metadata extraction for OOXML. * * @see OOXMLExtractor#getMetadataExtractor() */ public class MetadataExtractor { private final POIXMLTextExtractor extractor; public MetadataExtractor(POIXMLTextExtractor extractor) { this.extractor = extractor; } public void extract(Metadata metadata) throws TikaException { if (extractor.getDocument() != null || ((extractor instanceof XSSFEventBasedExcelExtractor || extractor instanceof XWPFEventBasedWordExtractor || extractor instanceof XSLFEventBasedPowerPointExtractor || extractor instanceof XPSTextExtractor) && extractor.getPackage() != null)) { extractMetadata(extractor.getCoreProperties(), metadata); extractMetadata(extractor.getExtendedProperties(), metadata); extractMetadata(extractor.getCustomProperties(), metadata); } } private void extractMetadata(POIXMLProperties.CoreProperties properties, Metadata metadata) { PackagePropertiesPart propsHolder = properties .getUnderlyingProperties(); setProperty(metadata, OfficeOpenXMLCore.CATEGORY, propsHolder.getCategoryProperty()); setProperty(metadata, OfficeOpenXMLCore.CONTENT_STATUS, propsHolder .getContentStatusProperty()); setProperty(metadata, TikaCoreProperties.CREATED, propsHolder .getCreatedProperty()); addMultiProperty(metadata, TikaCoreProperties.CREATOR, propsHolder .getCreatorProperty()); setProperty(metadata, TikaCoreProperties.DESCRIPTION, propsHolder .getDescriptionProperty()); setProperty(metadata, TikaCoreProperties.IDENTIFIER, propsHolder .getIdentifierProperty()); addProperty(metadata, OfficeOpenXMLCore.SUBJECT, propsHolder.getSubjectProperty()); addProperty(metadata, TikaCoreProperties.KEYWORDS, propsHolder .getKeywordsProperty()); setProperty(metadata, TikaCoreProperties.LANGUAGE, propsHolder .getLanguageProperty()); setProperty(metadata, TikaCoreProperties.MODIFIER, propsHolder .getLastModifiedByProperty()); setProperty(metadata, TikaCoreProperties.PRINT_DATE, propsHolder .getLastPrintedProperty()); setProperty(metadata, 
TikaCoreProperties.MODIFIED, propsHolder .getModifiedProperty()); setProperty(metadata, OfficeOpenXMLCore.REVISION, propsHolder .getRevisionProperty()); // TODO: Move to OO subject in Tika 2.0 setProperty(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, propsHolder.getSubjectProperty()); setProperty(metadata, TikaCoreProperties.TITLE, propsHolder.getTitleProperty()); setProperty(metadata, OfficeOpenXMLCore.VERSION, propsHolder.getVersionProperty()); // Legacy Tika-1.0 style stats // TODO Remove these in Tika 2.0 setProperty(metadata, Metadata.CATEGORY, propsHolder.getCategoryProperty()); setProperty(metadata, Metadata.CONTENT_STATUS, propsHolder .getContentStatusProperty()); setProperty(metadata, Metadata.REVISION_NUMBER, propsHolder .getRevisionProperty()); setProperty(metadata, Metadata.VERSION, propsHolder.getVersionProperty()); } private void extractMetadata(POIXMLProperties.ExtendedProperties properties, Metadata metadata) { CTProperties propsHolder = properties.getUnderlyingProperties(); //TIKA-2055, some ooxml files can include unsigned int/long values //which cause this exception. //For now, catch it and record as '0' because //Word converts to '0' on save. 
int totalTime = 0; try { totalTime = propsHolder.getTotalTime(); } catch (XmlValueOutOfRangeException e) { //swallow for now } setProperty(metadata, OfficeOpenXMLExtended.APPLICATION, propsHolder.getApplication()); setProperty(metadata, OfficeOpenXMLExtended.APP_VERSION, propsHolder.getAppVersion()); setProperty(metadata, TikaCoreProperties.PUBLISHER, propsHolder.getCompany()); setProperty(metadata, OfficeOpenXMLExtended.COMPANY, propsHolder.getCompany()); SummaryExtractor.addMulti(metadata, OfficeOpenXMLExtended.MANAGER, propsHolder.getManager()); setProperty(metadata, OfficeOpenXMLExtended.NOTES, propsHolder.getNotes()); setProperty(metadata, OfficeOpenXMLExtended.PRESENTATION_FORMAT, propsHolder.getPresentationFormat()); setProperty(metadata, OfficeOpenXMLExtended.TEMPLATE, propsHolder.getTemplate()); setProperty(metadata, OfficeOpenXMLExtended.TOTAL_TIME, totalTime); int docSecurityFlag = propsHolder.getDocSecurity(); setProperty(metadata, OfficeOpenXMLExtended.DOC_SECURITY, docSecurityFlag); setProperty(metadata, OfficeOpenXMLExtended.DOC_SECURITY_STRING, getDocSecurityString(docSecurityFlag)); if (propsHolder.getPages() > 0) { metadata.set(PagedText.N_PAGES, propsHolder.getPages()); } else if (propsHolder.getSlides() > 0) { metadata.set(PagedText.N_PAGES, propsHolder.getSlides()); } // Process the document statistics setProperty(metadata, Office.PAGE_COUNT, propsHolder.getPages()); setProperty(metadata, Office.SLIDE_COUNT, propsHolder.getSlides()); setProperty(metadata, Office.PARAGRAPH_COUNT, propsHolder.getParagraphs()); setProperty(metadata, Office.LINE_COUNT, propsHolder.getLines()); setProperty(metadata, Office.WORD_COUNT, propsHolder.getWords()); setProperty(metadata, Office.CHARACTER_COUNT, propsHolder.getCharacters()); setProperty(metadata, Office.CHARACTER_COUNT_WITH_SPACES, propsHolder.getCharactersWithSpaces()); // Legacy Tika-1.0 style stats // TODO Remove these in Tika 2.0 setProperty(metadata, Metadata.APPLICATION_NAME, 
propsHolder.getApplication()); setProperty(metadata, Metadata.APPLICATION_VERSION, propsHolder.getAppVersion()); setProperty(metadata, Metadata.MANAGER, propsHolder.getManager()); setProperty(metadata, Metadata.NOTES, propsHolder.getNotes()); setProperty(metadata, Metadata.PRESENTATION_FORMAT, propsHolder.getPresentationFormat()); setProperty(metadata, Metadata.TEMPLATE, propsHolder.getTemplate()); setProperty(metadata, Metadata.TOTAL_TIME, totalTime); setProperty(metadata, MSOffice.PAGE_COUNT, propsHolder.getPages()); setProperty(metadata, MSOffice.SLIDE_COUNT, propsHolder.getSlides()); setProperty(metadata, MSOffice.PARAGRAPH_COUNT, propsHolder.getParagraphs()); setProperty(metadata, MSOffice.LINE_COUNT, propsHolder.getLines()); setProperty(metadata, MSOffice.WORD_COUNT, propsHolder.getWords()); setProperty(metadata, MSOffice.CHARACTER_COUNT, propsHolder.getCharacters()); setProperty(metadata, MSOffice.CHARACTER_COUNT_WITH_SPACES, propsHolder.getCharactersWithSpaces()); } private String getDocSecurityString(int docSecurityFlag) { //mappings from: https://exiftool.org/TagNames/OOXML.html and //https://docs.microsoft.com/en-us/dotnet/api/documentformat.openxml.extendedproperties.documentsecurity?view=openxml-2.8.1 switch(docSecurityFlag) { case 0: return OfficeOpenXMLExtended.SECURITY_NONE; case 1: return OfficeOpenXMLExtended.SECURITY_PASSWORD_PROTECTED; case 2: return OfficeOpenXMLExtended.SECURITY_READ_ONLY_RECOMMENDED; case 4: return OfficeOpenXMLExtended.SECURITY_READ_ONLY_ENFORCED; case 8: return OfficeOpenXMLExtended.SECURITY_LOCKED_FOR_ANNOTATIONS; default: return OfficeOpenXMLExtended.SECURITY_UNKNOWN; } } private void extractMetadata(POIXMLProperties.CustomProperties properties, Metadata metadata) { org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperties props = properties.getUnderlyingProperties(); for (int i = 0; i < props.sizeOfPropertyArray(); i++) { CTProperty property = props.getPropertyArray(i); String val = null; Date date = 
null; if (property.isSetLpwstr()) { val = property.getLpwstr(); } else if (property.isSetLpstr()) { val = property.getLpstr(); } else if (property.isSetDate()) { date = property.getDate().getTime(); } else if (property.isSetFiletime()) { date = property.getFiletime().getTime(); } else if (property.isSetBool()) { val = Boolean.toString(property.getBool()); } // Integers else if (property.isSetI1()) { val = Integer.toString(property.getI1()); } else if (property.isSetI2()) { val = Integer.toString(property.getI2()); } else if (property.isSetI4()) { val = Integer.toString(property.getI4()); } else if (property.isSetI8()) { val = Long.toString(property.getI8()); } else if (property.isSetInt()) { val = Integer.toString(property.getInt()); } // Unsigned Integers else if (property.isSetUi1()) { val = Integer.toString(property.getUi1()); } else if (property.isSetUi2()) { val = Integer.toString(property.getUi2()); } else if (property.isSetUi4()) { val = Long.toString(property.getUi4()); } else if (property.isSetUi8()) { val = property.getUi8().toString(); } else if (property.isSetUint()) { val = Long.toString(property.getUint()); } // Reals else if (property.isSetR4()) { val = Float.toString(property.getR4()); } else if (property.isSetR8()) { val = Double.toString(property.getR8()); } else if (property.isSetDecimal()) { BigDecimal d = property.getDecimal(); if (d == null) { val = null; } else { val = d.toPlainString(); } } else if (property.isSetArray()) { // TODO Fetch the array values and output } else if (property.isSetVector()) { // TODO Fetch the vector values and output } else if (property.isSetBlob() || property.isSetOblob()) { // TODO Decode, if possible } else if (property.isSetStream() || property.isSetOstream() || property.isSetVstream()) { // TODO Decode, if possible } else if (property.isSetStorage() || property.isSetOstorage()) { // TODO Decode, if possible } else { // This type isn't currently supported yet, skip the property } String propName = "custom:" + 
property.getName(); if (date != null) { Property tikaProp = Property.externalDate(propName); metadata.set(tikaProp, date); } else if (val != null) { metadata.set(propName, val); } } } private void setProperty(Metadata metadata, Property property, Optional nullableValue) { if (!nullableValue.isPresent()) { return; } T value = nullableValue.get(); if (value instanceof Date) { metadata.set(property, (Date) value); } else if (value instanceof String) { metadata.set(property, (String) value); } else if (value instanceof Integer) { metadata.set(property, (Integer) value); } else if (value instanceof Double) { metadata.set(property, (Double) value); } } private void addProperty(Metadata metadata, Property property, Optional nullableValue) { if (!nullableValue.isPresent()) { return; } T value = nullableValue.get(); if (value instanceof String) { metadata.add(property, (String) value); } else { throw new IllegalArgumentException("Can't add property of class: " + nullableValue.getClass()); } } private void setProperty(Metadata metadata, String property, Optional nullableValue) { if (!nullableValue.isPresent()) { return; } String value = nullableValue.get(); metadata.set(property, value); } private void setProperty(Metadata metadata, Property property, String value) { if (value != null) { metadata.set(property, value); } } private void setProperty(Metadata metadata, String name, String value) { if (value != null) { metadata.set(name, value); } } private void setProperty(Metadata metadata, Property property, int value) { if (value > 0) { metadata.set(property, value); } } private void setProperty(Metadata metadata, String name, int value) { if (value > 0) { metadata.set(name, Integer.toString(value)); } } private void addMultiProperty(Metadata metadata, Property property, Optional value) { if (!value.isPresent()) { return; } SummaryExtractor.addMulti(metadata, property, value.get()); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy