org.apache.tika.parser.microsoft.ExcelExtractor Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft;
import java.awt.Point;
import java.io.IOException;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import org.apache.poi.ddf.EscherBSERecord;
import org.apache.poi.ddf.EscherBlipRecord;
import org.apache.poi.ddf.EscherRecord;
import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
import org.apache.poi.hssf.eventusermodel.HSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFRequest;
import org.apache.poi.hssf.extractor.OldExcelExtractor;
import org.apache.poi.hssf.model.InternalWorkbook;
import org.apache.poi.hssf.record.BOFRecord;
import org.apache.poi.hssf.record.BoundSheetRecord;
import org.apache.poi.hssf.record.CellValueRecordInterface;
import org.apache.poi.hssf.record.CountryRecord;
import org.apache.poi.hssf.record.DateWindow1904Record;
import org.apache.poi.hssf.record.DrawingGroupRecord;
import org.apache.poi.hssf.record.EOFRecord;
import org.apache.poi.hssf.record.ExtendedFormatRecord;
import org.apache.poi.hssf.record.FooterRecord;
import org.apache.poi.hssf.record.FormatRecord;
import org.apache.poi.hssf.record.FormulaRecord;
import org.apache.poi.hssf.record.HeaderRecord;
import org.apache.poi.hssf.record.HyperlinkRecord;
import org.apache.poi.hssf.record.LabelRecord;
import org.apache.poi.hssf.record.LabelSSTRecord;
import org.apache.poi.hssf.record.NumberRecord;
import org.apache.poi.hssf.record.RKRecord;
import org.apache.poi.hssf.record.Record;
import org.apache.poi.hssf.record.SSTRecord;
import org.apache.poi.hssf.record.StringRecord;
import org.apache.poi.hssf.record.TextObjectRecord;
import org.apache.poi.hssf.record.chart.SeriesTextRecord;
import org.apache.poi.hssf.record.common.UnicodeString;
import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
import org.apache.poi.hssf.usermodel.HSSFPictureData;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.ss.usermodel.DateUtil;
import org.xml.sax.SAXException;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
/**
* Excel parser implementation which uses POI's Event API
* to handle the contents of a Workbook.
*
* The Event API uses a much smaller memory footprint than
* HSSFWorkbook
when processing excel files
* but at the cost of more complexity.
*
* With the Event API a listener is registered for
* specific record types and those records are created,
* fired off to the listener and then discarded as the stream
* is being processed.
*
* @see org.apache.poi.hssf.eventusermodel.HSSFListener
* @see
* POI Event API How To
*/
public class ExcelExtractor extends AbstractPOIFSExtractor {
private static final String BOOK_ENTRY = "Book";
/**
* true
if the HSSFListener should be registered
* to listen for all records or false
(the default)
* if the listener should be configured to only receive specified
* records.
*/
private boolean listenForAllRecords = false;
public ExcelExtractor(ParseContext context, Metadata metadata) {
super(context, metadata);
}
/**
* Looks for one of the variant names for the workbook entry;
* returns null if not found.
*
* @param root directory root to search
* @return workbook entry or null
*/
private static String findWorkbookEntry(DirectoryNode root) {
for (String workbookDirEntryName : InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES) {
if (root.hasEntry(workbookDirEntryName)) {
return workbookDirEntryName;
}
}
return null;
}
/**
* Returns true
if this parser is configured to listen
* for all records instead of just the specified few.
*/
public boolean isListenForAllRecords() {
return listenForAllRecords;
}
/**
* Specifies whether this parser should to listen for all
* records or just for the specified few.
*
* Note: Under normal operation this setting should
* be false
(the default), but you can experiment with
* this setting for testing and debugging purposes.
*
* @param listenForAllRecords true
if the HSSFListener
* should be registered to listen for all records or
* false
* if the listener should be configured to only receive specified
* records.
*/
public void setListenForAllRecords(boolean listenForAllRecords) {
this.listenForAllRecords = listenForAllRecords;
}
/**
* Extracts text from an Excel Workbook writing the extracted content
* to the specified {@link Appendable}.
*
* @param filesystem POI file system
* @throws IOException if an error occurs processing the workbook
* or writing the extracted content
*/
protected void parse(POIFSFileSystem filesystem, XHTMLContentHandler xhtml, Locale locale)
throws IOException, SAXException, TikaException {
parse(filesystem.getRoot(), xhtml, locale);
}
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml, Locale locale)
throws IOException, SAXException, TikaException {
String workbookEntryName = findWorkbookEntry(root);
if (workbookEntryName == null) {
if (root.hasEntry(BOOK_ENTRY)) {
// Excel 5 / Excel 95 file
// Records are in a different structure so needs a
// different parser to process them
OldExcelExtractor extractor = new OldExcelExtractor(root);
OldExcelParser.parse(extractor, xhtml);
return;
} else {
// Corrupt file / very old file
throw new TikaException("Couldn't find workbook entry");
}
}
// If a password was supplied, use it, otherwise the default
Biff8EncryptionKey.setCurrentUserPassword(getPassword());
// Have the file processed in event mode
TikaHSSFListener listener =
new TikaHSSFListener(workbookEntryName, xhtml, locale, this, officeParserConfig);
listener.processFile(root, isListenForAllRecords());
listener.throwStoredException();
for (Entry entry : root) {
if (entry.getName().startsWith("MBD") && entry instanceof DirectoryEntry) {
try {
handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml, true);
} catch (TikaException e) {
// ignore parse errors from embedded documents
}
}
}
}
// ======================================================================
/**
* HSSF Listener implementation which processes the HSSF records.
*/
private static class TikaHSSFListener implements HSSFListener {
/**
* XHTML content handler to which the document content is rendered.
*/
private final XHTMLContentHandler handler;
/**
* The POIFS Extractor, used for embeded resources.
*/
private final AbstractPOIFSExtractor extractor;
/**
* Format for rendering numbers in the worksheet. Currently we just
* use the platform default formatting.
*
* @see TIKA-103
*/
private final NumberFormat format;
private final OfficeParserConfig officeParserConfig;
private final TikaExcelDataFormatter tikaExcelDataFormatter;
private final String workbookEntryName;
/**
* Potential exception thrown by the content handler. When set to
* non-null
, causes all subsequent HSSF records to be
* ignored and the stored exception to be thrown when
* {@link #throwStoredException()} is invoked.
*/
private Exception exception = null;
private SSTRecord sstRecord;
private FormulaRecord stringFormulaRecord;
private short previousSid;
/**
* Internal FormatTrackingHSSFListener
to handle cell
* formatting within the extraction.
*/
private FormatTrackingHSSFListener formatListener;
/**
* List of worksheet names.
*/
private List sheetNames = new ArrayList<>();
/**
* Index of the current worksheet within the workbook.
* Used to find the worksheet name in the {@link #sheetNames} list.
*/
private short currentSheetIndex;
/**
* Content of the current worksheet, or null
if no
* worksheet is currently active.
*/
private SortedMap currentSheet = null;
/**
* Extra text or cells that crops up, typically as part of a
* worksheet but not always.
*/
private List extraTextCells = new ArrayList<>();
/**
* These aren't complete when we first see them, as the
* depend on continue records that aren't always
* contiguous. Collect them for later processing.
*/
private List drawingGroups = new ArrayList<>();
/**
* Construct a new listener instance outputting parsed data to
* the specified XHTML content handler.
*
* @param handler Destination to write the parsed output to
*/
private TikaHSSFListener(String workbookEntryName, XHTMLContentHandler handler,
Locale locale, AbstractPOIFSExtractor extractor,
OfficeParserConfig officeParserConfig) {
this.workbookEntryName = workbookEntryName;
this.handler = handler;
this.extractor = extractor;
this.format = NumberFormat.getInstance(locale);
this.formatListener = new TikaFormatTrackingHSSFListener(this, locale);
this.tikaExcelDataFormatter = new TikaExcelDataFormatter(locale);
this.officeParserConfig = officeParserConfig;
this.tikaExcelDataFormatter
.setDateFormatOverride(officeParserConfig.getDateFormatOverride());
}
/**
* Entry point to listener to start the processing of a file.
*
* @param filesystem POI file system.
* @param listenForAllRecords sets whether the listener is configured to listen
* for all records types or not.
* @throws IOException on any IO errors.
* @throws SAXException on any SAX parsing errors.
*/
public void processFile(POIFSFileSystem filesystem, boolean listenForAllRecords)
throws IOException, SAXException, TikaException {
processFile(filesystem.getRoot(), listenForAllRecords);
}
public void processFile(DirectoryNode root, boolean listenForAllRecords)
throws IOException, SAXException, TikaException {
// Set up listener and register the records we want to process
HSSFRequest hssfRequest = new HSSFRequest();
if (listenForAllRecords) {
hssfRequest.addListenerForAllRecords(formatListener);
} else {
hssfRequest.addListener(formatListener, BOFRecord.sid);
hssfRequest.addListener(formatListener, EOFRecord.sid);
hssfRequest.addListener(formatListener, DateWindow1904Record.sid);
hssfRequest.addListener(formatListener, CountryRecord.sid);
hssfRequest.addListener(formatListener, BoundSheetRecord.sid);
hssfRequest.addListener(formatListener, SSTRecord.sid);
hssfRequest.addListener(formatListener, FormulaRecord.sid);
hssfRequest.addListener(formatListener, LabelRecord.sid);
hssfRequest.addListener(formatListener, LabelSSTRecord.sid);
hssfRequest.addListener(formatListener, NumberRecord.sid);
hssfRequest.addListener(formatListener, RKRecord.sid);
hssfRequest.addListener(formatListener, StringRecord.sid);
hssfRequest.addListener(formatListener, HyperlinkRecord.sid);
hssfRequest.addListener(formatListener, TextObjectRecord.sid);
hssfRequest.addListener(formatListener, SeriesTextRecord.sid);
hssfRequest.addListener(formatListener, FormatRecord.sid);
hssfRequest.addListener(formatListener, ExtendedFormatRecord.sid);
hssfRequest.addListener(formatListener, DrawingGroupRecord.sid);
if (extractor.officeParserConfig.isIncludeHeadersAndFooters()) {
hssfRequest.addListener(formatListener, HeaderRecord.sid);
hssfRequest.addListener(formatListener, FooterRecord.sid);
}
}
// Create event factory and process Workbook (fire events)
DocumentInputStream documentInputStream =
root.createDocumentInputStream(workbookEntryName);
HSSFEventFactory eventFactory = new HSSFEventFactory();
try {
eventFactory.processEvents(hssfRequest, documentInputStream);
} catch (org.apache.poi.EncryptedDocumentException e) {
throw new EncryptedDocumentException(e);
}
// Output any extra text that came after all the sheets
processExtraText();
// Look for embeded images, now that the drawing records
// have been fully matched with their continue data
for (DrawingGroupRecord dgr : drawingGroups) {
dgr.decode();
findPictures(dgr.getEscherRecords());
}
}
/**
* Process a HSSF record.
*
* @param record HSSF Record
*/
public void processRecord(Record record) {
if (exception == null) {
try {
internalProcessRecord(record);
} catch (TikaException | IOException | SAXException e) {
exception = e;
}
}
}
public void throwStoredException() throws TikaException, SAXException, IOException {
if (exception != null) {
if (exception instanceof IOException) {
throw (IOException) exception;
}
if (exception instanceof SAXException) {
throw (SAXException) exception;
}
if (exception instanceof TikaException) {
throw (TikaException) exception;
}
throw new TikaException(exception.getMessage());
}
}
private void internalProcessRecord(Record record)
throws SAXException, TikaException, IOException {
switch (record.getSid()) {
case BOFRecord.sid: // start of workbook, worksheet etc. records
BOFRecord bof = (BOFRecord) record;
if (bof.getType() == BOFRecord.TYPE_WORKBOOK) {
currentSheetIndex = -1;
} else if (bof.getType() == BOFRecord.TYPE_CHART) {
if (previousSid == EOFRecord.sid) {
// This is a sheet which contains only a chart
newSheet();
} else {
// This is a chart within a normal sheet
// Handling of this is a bit hacky...
if (currentSheet != null) {
processSheet();
currentSheetIndex--;
newSheet();
}
}
} else if (bof.getType() == BOFRecord.TYPE_WORKSHEET) {
newSheet();
}
break;
case EOFRecord.sid: // end of workbook, worksheet etc. records
if (currentSheet != null) {
processSheet();
}
currentSheet = null;
break;
case BoundSheetRecord.sid: // Worksheet index record
BoundSheetRecord boundSheetRecord = (BoundSheetRecord) record;
sheetNames.add(boundSheetRecord.getSheetname());
break;
case SSTRecord.sid: // holds all the strings for LabelSSTRecords
sstRecord = (SSTRecord) record;
break;
case FormulaRecord.sid: // Cell value from a formula
FormulaRecord formula = (FormulaRecord) record;
if (formula.hasCachedResultString()) {
// The String itself should be the next record
stringFormulaRecord = formula;
} else {
addTextCell(record, formatListener.formatNumberDateCell(formula));
}
break;
case StringRecord.sid:
if (previousSid == FormulaRecord.sid) {
// Cached string value of a string formula
StringRecord sr = (StringRecord) record;
addTextCell(stringFormulaRecord, sr.getString());
} else {
// Some other string not associated with a cell, skip
}
break;
case LabelRecord.sid: // strings stored directly in the cell
LabelRecord label = (LabelRecord) record;
addTextCell(record, label.getValue());
break;
case LabelSSTRecord.sid: // Ref. a string in the shared string table
LabelSSTRecord sst = (LabelSSTRecord) record;
UnicodeString unicode = sstRecord.getString(sst.getSSTIndex());
String cellString = null;
if (officeParserConfig.isConcatenatePhoneticRuns()) {
String phonetic = (unicode != null && unicode.getExtendedRst() != null &&
unicode.getExtendedRst().getPhoneticText() != null &&
unicode.getExtendedRst().getPhoneticText().trim().length() > 0) ?
unicode.getExtendedRst().getPhoneticText() : "";
cellString = unicode.getString() + " " + phonetic;
} else {
cellString = unicode.getString();
}
addTextCell(record, cellString);
break;
case NumberRecord.sid: // Contains a numeric cell value
NumberRecord number = (NumberRecord) record;
addTextCell(record, formatListener.formatNumberDateCell(number));
break;
case RKRecord.sid: // Excel internal number record
RKRecord rk = (RKRecord) record;
addCell(record, new NumberCell(rk.getRKNumber(), format));
break;
case HyperlinkRecord.sid: // holds a URL associated with a cell
if (currentSheet != null) {
HyperlinkRecord link = (HyperlinkRecord) record;
Point point = new Point(link.getFirstColumn(), link.getFirstRow());
Cell cell = currentSheet.get(point);
if (cell != null) {
String address = link.getAddress();
if (address != null) {
addCell(record, new LinkedCell(cell, address));
} else {
addCell(record, cell);
}
}
}
break;
case TextObjectRecord.sid:
if (extractor.officeParserConfig.isIncludeShapeBasedContent()) {
TextObjectRecord tor = (TextObjectRecord) record;
addTextCell(record, tor.getStr().getString());
}
break;
case SeriesTextRecord.sid: // Chart label or title
SeriesTextRecord str = (SeriesTextRecord) record;
addTextCell(record, str.getText());
break;
case DrawingGroupRecord.sid:
// Collect this now, we'll process later when all
// the continue records are in
drawingGroups.add((DrawingGroupRecord) record);
break;
case HeaderRecord.sid:
if (extractor.officeParserConfig.isIncludeHeadersAndFooters()) {
HeaderRecord headerRecord = (HeaderRecord) record;
addTextCell(record, headerRecord.getText());
}
break;
case FooterRecord.sid:
if (extractor.officeParserConfig.isIncludeHeadersAndFooters()) {
FooterRecord footerRecord = (FooterRecord) record;
addTextCell(record, footerRecord.getText());
}
break;
}
previousSid = record.getSid();
if (stringFormulaRecord != record) {
stringFormulaRecord = null;
}
}
private void processExtraText() throws SAXException {
if (extraTextCells.size() > 0) {
for (Cell cell : extraTextCells) {
handler.startElement("div", "class", "outside");
cell.render(handler);
handler.endElement("div");
}
// Reset
extraTextCells.clear();
}
}
/**
* Adds the given cell (unless null ) to the current
* worksheet (if any) at the position (if any) of the given record.
*
* @param record record that holds the cell value
* @param cell cell value (or null )
*/
private void addCell(Record record, Cell cell) throws SAXException {
if (cell == null) {
// Ignore empty cells
} else if (currentSheet != null && record instanceof CellValueRecordInterface) {
// Normal cell inside a worksheet
CellValueRecordInterface value = (CellValueRecordInterface) record;
Point point = new Point(value.getColumn(), value.getRow());
if (currentSheet.containsKey(point)) {
//avoid overwriting content
//for now, add to extraTextCells
//TODO: consider allowing multiple text pieces
//per x,y to keep the text together
extraTextCells.add(cell);
} else {
currentSheet.put(point, cell);
}
} else {
// Cell outside the worksheets
extraTextCells.add(cell);
}
}
/**
* Adds a text cell with the given text comment. The given text
* is trimmed, and ignored if null or empty.
*
* @param record record that holds the text value
* @param text text content, may be null
* @throws SAXException
*/
private void addTextCell(Record record, String text) throws SAXException {
if (text != null) {
text = text.trim();
if (text.length() > 0) {
addCell(record, new TextCell(text));
}
}
}
private void newSheet() {
currentSheetIndex++;
currentSheet = new TreeMap<>(new PointComparator());
}
/**
* Process an excel sheet.
*
* @throws SAXException if an error occurs
*/
private void processSheet() throws SAXException {
// Sheet Start
handler.startElement("div", "class", "page");
if (currentSheetIndex < sheetNames.size()) {
handler.element("h1", sheetNames.get(currentSheetIndex));
}
handler.startElement("table");
handler.startElement("tbody");
// Process Rows
int currentRow = 0;
int currentColumn = 0;
handler.startElement("tr");
handler.startElement("td");
for (Map.Entry entry : currentSheet.entrySet()) {
if (currentRow != entry.getKey().y) {
// We've moved onto a new row, possibly skipping some
do {
handler.endElement("td");
handler.endElement("tr");
handler.startElement("tr");
handler.startElement("td");
currentRow++;
} while (officeParserConfig.isIncludeMissingRows() &&
currentRow < entry.getKey().y);
currentRow = entry.getKey().y;
currentColumn = 0;
}
while (currentColumn < entry.getKey().x) {
handler.endElement("td");
handler.startElement("td");
currentColumn++;
}
entry.getValue().render(handler);
}
handler.endElement("td");
handler.endElement("tr");
// Sheet End
handler.endElement("tbody");
handler.endElement("table");
// Finish up
processExtraText();
handler.endElement("div");
}
private void findPictures(List records)
throws IOException, SAXException, TikaException {
for (EscherRecord escherRecord : records) {
if (escherRecord instanceof EscherBSERecord) {
EscherBlipRecord blip = ((EscherBSERecord) escherRecord).getBlipRecord();
if (blip != null) {
HSSFPictureData picture = new HSSFPictureData(blip);
String mimeType = picture.getMimeType();
TikaInputStream stream = TikaInputStream.get(picture.getData());
// Handle the embeded resource
extractor.handleEmbeddedResource(stream, null, null, mimeType, handler,
true);
}
}
// Recursive call.
findPictures(escherRecord.getChildRecords());
}
}
private class TikaFormatTrackingHSSFListener extends FormatTrackingHSSFListener {
public TikaFormatTrackingHSSFListener(HSSFListener childListener, Locale locale) {
super(childListener, locale);
}
@Override
public void processRecord(Record record) {
// System.out.println(record.getClass() + " : "+record.toString());
super.processRecord(record);
}
@Override
public String formatNumberDateCell(CellValueRecordInterface cell) {
String formatString = this.getFormatString(cell);
double value;
if (cell instanceof NumberRecord) {
value = ((NumberRecord) cell).getValue();
} else {
if (!(cell instanceof FormulaRecord)) {
throw new IllegalArgumentException(
"Unsupported CellValue Record passed in " + cell);
}
value = ((FormulaRecord) cell).getValue();
}
if (DateUtil.isADateFormat(getFormatIndex(cell), formatString)) {
return tikaExcelDataFormatter
.formatRawCellContents(value, getFormatIndex(cell), formatString,
false);
} else if ("general".equalsIgnoreCase(formatString)) {
return tikaExcelDataFormatter
.formatRawCellContents(value, getFormatIndex(cell), formatString,
false);
}
return super.formatNumberDateCell(cell);
}
}
}
/**
* Utility comparator for points.
*/
private static class PointComparator implements Comparator {
public int compare(Point a, Point b) {
int diff = a.y - b.y;
if (diff == 0) {
diff = a.x - b.x;
}
return diff;
}
}
}
|
© 2015 - 2025 Weber Informatics LLC | Privacy Policy