All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.parser.microsoft.ExcelExtractor Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft;

import java.awt.Point;
import java.io.IOException;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;

import org.apache.poi.ddf.EscherBSERecord;
import org.apache.poi.ddf.EscherBlipRecord;
import org.apache.poi.ddf.EscherRecord;
import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
import org.apache.poi.hssf.eventusermodel.HSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFRequest;
import org.apache.poi.hssf.extractor.OldExcelExtractor;
import org.apache.poi.hssf.model.InternalWorkbook;
import org.apache.poi.hssf.record.BOFRecord;
import org.apache.poi.hssf.record.BoundSheetRecord;
import org.apache.poi.hssf.record.CellValueRecordInterface;
import org.apache.poi.hssf.record.CountryRecord;
import org.apache.poi.hssf.record.DateWindow1904Record;
import org.apache.poi.hssf.record.DrawingGroupRecord;
import org.apache.poi.hssf.record.EOFRecord;
import org.apache.poi.hssf.record.ExtendedFormatRecord;
import org.apache.poi.hssf.record.FooterRecord;
import org.apache.poi.hssf.record.FormatRecord;
import org.apache.poi.hssf.record.FormulaRecord;
import org.apache.poi.hssf.record.HeaderRecord;
import org.apache.poi.hssf.record.HyperlinkRecord;
import org.apache.poi.hssf.record.LabelRecord;
import org.apache.poi.hssf.record.LabelSSTRecord;
import org.apache.poi.hssf.record.NumberRecord;
import org.apache.poi.hssf.record.RKRecord;
import org.apache.poi.hssf.record.Record;
import org.apache.poi.hssf.record.SSTRecord;
import org.apache.poi.hssf.record.StringRecord;
import org.apache.poi.hssf.record.TextObjectRecord;
import org.apache.poi.hssf.record.chart.SeriesTextRecord;
import org.apache.poi.hssf.record.common.UnicodeString;
import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
import org.apache.poi.hssf.usermodel.HSSFPictureData;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.ss.usermodel.DateUtil;
import org.xml.sax.SAXException;

import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;

/**
 * Excel parser implementation which uses POI's Event API
 * to handle the contents of a Workbook.
 * 

* The Event API uses a much smaller memory footprint than * HSSFWorkbook when processing excel files * but at the cost of more complexity. *

* With the Event API a listener is registered for * specific record types and those records are created, * fired off to the listener and then discarded as the stream * is being processed. * * @see org.apache.poi.hssf.eventusermodel.HSSFListener * @see * POI Event API How To */ public class ExcelExtractor extends AbstractPOIFSExtractor { private static final String BOOK_ENTRY = "Book"; /** * true if the HSSFListener should be registered * to listen for all records or false (the default) * if the listener should be configured to only receive specified * records. */ private boolean listenForAllRecords = false; public ExcelExtractor(ParseContext context, Metadata metadata) { super(context, metadata); } /** * Looks for one of the variant names for the workbook entry; * returns null if not found. * * @param root directory root to search * @return workbook entry or null */ private static String findWorkbookEntry(DirectoryNode root) { for (String workbookDirEntryName : InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES) { if (root.hasEntry(workbookDirEntryName)) { return workbookDirEntryName; } } return null; } /** * Returns true if this parser is configured to listen * for all records instead of just the specified few. */ public boolean isListenForAllRecords() { return listenForAllRecords; } /** * Specifies whether this parser should to listen for all * records or just for the specified few. *

* Note: Under normal operation this setting should * be false (the default), but you can experiment with * this setting for testing and debugging purposes. * * @param listenForAllRecords true if the HSSFListener * should be registered to listen for all records or * false * if the listener should be configured to only receive specified * records. */ public void setListenForAllRecords(boolean listenForAllRecords) { this.listenForAllRecords = listenForAllRecords; } /** * Extracts text from an Excel Workbook writing the extracted content * to the specified {@link Appendable}. * * @param filesystem POI file system * @throws IOException if an error occurs processing the workbook * or writing the extracted content */ protected void parse(POIFSFileSystem filesystem, XHTMLContentHandler xhtml, Locale locale) throws IOException, SAXException, TikaException { parse(filesystem.getRoot(), xhtml, locale); } protected void parse(DirectoryNode root, XHTMLContentHandler xhtml, Locale locale) throws IOException, SAXException, TikaException { String workbookEntryName = findWorkbookEntry(root); if (workbookEntryName == null) { if (root.hasEntry(BOOK_ENTRY)) { // Excel 5 / Excel 95 file // Records are in a different structure so needs a // different parser to process them OldExcelExtractor extractor = new OldExcelExtractor(root); OldExcelParser.parse(extractor, xhtml); return; } else { // Corrupt file / very old file throw new TikaException("Couldn't find workbook entry"); } } // If a password was supplied, use it, otherwise the default Biff8EncryptionKey.setCurrentUserPassword(getPassword()); // Have the file processed in event mode TikaHSSFListener listener = new TikaHSSFListener(workbookEntryName, xhtml, locale, this, officeParserConfig); listener.processFile(root, isListenForAllRecords()); listener.throwStoredException(); for (Entry entry : root) { if (entry.getName().startsWith("MBD") && entry instanceof DirectoryEntry) { try { handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml, true); } catch (TikaException e) { // ignore parse errors from embedded documents } } } } // ====================================================================== /** * HSSF Listener implementation which processes the HSSF records. */ private static class TikaHSSFListener implements HSSFListener { /** * XHTML content handler to which the document content is rendered. */ private final XHTMLContentHandler handler; /** * The POIFS Extractor, used for embeded resources. */ private final AbstractPOIFSExtractor extractor; /** * Format for rendering numbers in the worksheet. Currently we just * use the platform default formatting. * * @see TIKA-103 */ private final NumberFormat format; private final OfficeParserConfig officeParserConfig; private final TikaExcelDataFormatter tikaExcelDataFormatter; private final String workbookEntryName; /** * Potential exception thrown by the content handler. When set to * non-null, causes all subsequent HSSF records to be * ignored and the stored exception to be thrown when * {@link #throwStoredException()} is invoked. */ private Exception exception = null; private SSTRecord sstRecord; private FormulaRecord stringFormulaRecord; private short previousSid; /** * Internal FormatTrackingHSSFListener to handle cell * formatting within the extraction. */ private FormatTrackingHSSFListener formatListener; /** * List of worksheet names. */ private List sheetNames = new ArrayList<>(); /** * Index of the current worksheet within the workbook. * Used to find the worksheet name in the {@link #sheetNames} list. */ private short currentSheetIndex; /** * Content of the current worksheet, or null if no * worksheet is currently active. */ private SortedMap currentSheet = null; /** * Extra text or cells that crops up, typically as part of a * worksheet but not always. */ private List extraTextCells = new ArrayList<>(); /** * These aren't complete when we first see them, as the * depend on continue records that aren't always * contiguous. Collect them for later processing. */ private List drawingGroups = new ArrayList<>(); /** * Construct a new listener instance outputting parsed data to * the specified XHTML content handler. * * @param handler Destination to write the parsed output to */ private TikaHSSFListener(String workbookEntryName, XHTMLContentHandler handler, Locale locale, AbstractPOIFSExtractor extractor, OfficeParserConfig officeParserConfig) { this.workbookEntryName = workbookEntryName; this.handler = handler; this.extractor = extractor; this.format = NumberFormat.getInstance(locale); this.formatListener = new TikaFormatTrackingHSSFListener(this, locale); this.tikaExcelDataFormatter = new TikaExcelDataFormatter(locale); this.officeParserConfig = officeParserConfig; this.tikaExcelDataFormatter .setDateFormatOverride(officeParserConfig.getDateFormatOverride()); } /** * Entry point to listener to start the processing of a file. * * @param filesystem POI file system. * @param listenForAllRecords sets whether the listener is configured to listen * for all records types or not. * @throws IOException on any IO errors. * @throws SAXException on any SAX parsing errors. */ public void processFile(POIFSFileSystem filesystem, boolean listenForAllRecords) throws IOException, SAXException, TikaException { processFile(filesystem.getRoot(), listenForAllRecords); } public void processFile(DirectoryNode root, boolean listenForAllRecords) throws IOException, SAXException, TikaException { // Set up listener and register the records we want to process HSSFRequest hssfRequest = new HSSFRequest(); if (listenForAllRecords) { hssfRequest.addListenerForAllRecords(formatListener); } else { hssfRequest.addListener(formatListener, BOFRecord.sid); hssfRequest.addListener(formatListener, EOFRecord.sid); hssfRequest.addListener(formatListener, DateWindow1904Record.sid); hssfRequest.addListener(formatListener, CountryRecord.sid); hssfRequest.addListener(formatListener, BoundSheetRecord.sid); hssfRequest.addListener(formatListener, SSTRecord.sid); hssfRequest.addListener(formatListener, FormulaRecord.sid); hssfRequest.addListener(formatListener, LabelRecord.sid); hssfRequest.addListener(formatListener, LabelSSTRecord.sid); hssfRequest.addListener(formatListener, NumberRecord.sid); hssfRequest.addListener(formatListener, RKRecord.sid); hssfRequest.addListener(formatListener, StringRecord.sid); hssfRequest.addListener(formatListener, HyperlinkRecord.sid); hssfRequest.addListener(formatListener, TextObjectRecord.sid); hssfRequest.addListener(formatListener, SeriesTextRecord.sid); hssfRequest.addListener(formatListener, FormatRecord.sid); hssfRequest.addListener(formatListener, ExtendedFormatRecord.sid); hssfRequest.addListener(formatListener, DrawingGroupRecord.sid); if (extractor.officeParserConfig.isIncludeHeadersAndFooters()) { hssfRequest.addListener(formatListener, HeaderRecord.sid); hssfRequest.addListener(formatListener, FooterRecord.sid); } } // Create event factory and process Workbook (fire events) DocumentInputStream documentInputStream = root.createDocumentInputStream(workbookEntryName); HSSFEventFactory eventFactory = new HSSFEventFactory(); try { eventFactory.processEvents(hssfRequest, documentInputStream); } catch (org.apache.poi.EncryptedDocumentException e) { throw new EncryptedDocumentException(e); } // Output any extra text that came after all the sheets processExtraText(); // Look for embeded images, now that the drawing records // have been fully matched with their continue data for (DrawingGroupRecord dgr : drawingGroups) { dgr.decode(); findPictures(dgr.getEscherRecords()); } } /** * Process a HSSF record. * * @param record HSSF Record */ public void processRecord(Record record) { if (exception == null) { try { internalProcessRecord(record); } catch (TikaException | IOException | SAXException e) { exception = e; } } } public void throwStoredException() throws TikaException, SAXException, IOException { if (exception != null) { if (exception instanceof IOException) { throw (IOException) exception; } if (exception instanceof SAXException) { throw (SAXException) exception; } if (exception instanceof TikaException) { throw (TikaException) exception; } throw new TikaException(exception.getMessage()); } } private void internalProcessRecord(Record record) throws SAXException, TikaException, IOException { switch (record.getSid()) { case BOFRecord.sid: // start of workbook, worksheet etc. records BOFRecord bof = (BOFRecord) record; if (bof.getType() == BOFRecord.TYPE_WORKBOOK) { currentSheetIndex = -1; } else if (bof.getType() == BOFRecord.TYPE_CHART) { if (previousSid == EOFRecord.sid) { // This is a sheet which contains only a chart newSheet(); } else { // This is a chart within a normal sheet // Handling of this is a bit hacky... if (currentSheet != null) { processSheet(); currentSheetIndex--; newSheet(); } } } else if (bof.getType() == BOFRecord.TYPE_WORKSHEET) { newSheet(); } break; case EOFRecord.sid: // end of workbook, worksheet etc. records if (currentSheet != null) { processSheet(); } currentSheet = null; break; case BoundSheetRecord.sid: // Worksheet index record BoundSheetRecord boundSheetRecord = (BoundSheetRecord) record; sheetNames.add(boundSheetRecord.getSheetname()); break; case SSTRecord.sid: // holds all the strings for LabelSSTRecords sstRecord = (SSTRecord) record; break; case FormulaRecord.sid: // Cell value from a formula FormulaRecord formula = (FormulaRecord) record; if (formula.hasCachedResultString()) { // The String itself should be the next record stringFormulaRecord = formula; } else { addTextCell(record, formatListener.formatNumberDateCell(formula)); } break; case StringRecord.sid: if (previousSid == FormulaRecord.sid) { // Cached string value of a string formula StringRecord sr = (StringRecord) record; addTextCell(stringFormulaRecord, sr.getString()); } else { // Some other string not associated with a cell, skip } break; case LabelRecord.sid: // strings stored directly in the cell LabelRecord label = (LabelRecord) record; addTextCell(record, label.getValue()); break; case LabelSSTRecord.sid: // Ref. a string in the shared string table LabelSSTRecord sst = (LabelSSTRecord) record; UnicodeString unicode = sstRecord.getString(sst.getSSTIndex()); String cellString = null; if (officeParserConfig.isConcatenatePhoneticRuns()) { String phonetic = (unicode != null && unicode.getExtendedRst() != null && unicode.getExtendedRst().getPhoneticText() != null && unicode.getExtendedRst().getPhoneticText().trim().length() > 0) ? unicode.getExtendedRst().getPhoneticText() : ""; cellString = unicode.getString() + " " + phonetic; } else { cellString = unicode.getString(); } addTextCell(record, cellString); break; case NumberRecord.sid: // Contains a numeric cell value NumberRecord number = (NumberRecord) record; addTextCell(record, formatListener.formatNumberDateCell(number)); break; case RKRecord.sid: // Excel internal number record RKRecord rk = (RKRecord) record; addCell(record, new NumberCell(rk.getRKNumber(), format)); break; case HyperlinkRecord.sid: // holds a URL associated with a cell if (currentSheet != null) { HyperlinkRecord link = (HyperlinkRecord) record; Point point = new Point(link.getFirstColumn(), link.getFirstRow()); Cell cell = currentSheet.get(point); if (cell != null) { String address = link.getAddress(); if (address != null) { addCell(record, new LinkedCell(cell, address)); } else { addCell(record, cell); } } } break; case TextObjectRecord.sid: if (extractor.officeParserConfig.isIncludeShapeBasedContent()) { TextObjectRecord tor = (TextObjectRecord) record; addTextCell(record, tor.getStr().getString()); } break; case SeriesTextRecord.sid: // Chart label or title SeriesTextRecord str = (SeriesTextRecord) record; addTextCell(record, str.getText()); break; case DrawingGroupRecord.sid: // Collect this now, we'll process later when all // the continue records are in drawingGroups.add((DrawingGroupRecord) record); break; case HeaderRecord.sid: if (extractor.officeParserConfig.isIncludeHeadersAndFooters()) { HeaderRecord headerRecord = (HeaderRecord) record; addTextCell(record, headerRecord.getText()); } break; case FooterRecord.sid: if (extractor.officeParserConfig.isIncludeHeadersAndFooters()) { FooterRecord footerRecord = (FooterRecord) record; addTextCell(record, footerRecord.getText()); } break; } previousSid = record.getSid(); if (stringFormulaRecord != record) { stringFormulaRecord = null; } } private void processExtraText() throws SAXException { if (extraTextCells.size() > 0) { for (Cell cell : extraTextCells) { handler.startElement("div", "class", "outside"); cell.render(handler); handler.endElement("div"); } // Reset extraTextCells.clear(); } } /** * Adds the given cell (unless null) to the current * worksheet (if any) at the position (if any) of the given record. * * @param record record that holds the cell value * @param cell cell value (or null) */ private void addCell(Record record, Cell cell) throws SAXException { if (cell == null) { // Ignore empty cells } else if (currentSheet != null && record instanceof CellValueRecordInterface) { // Normal cell inside a worksheet CellValueRecordInterface value = (CellValueRecordInterface) record; Point point = new Point(value.getColumn(), value.getRow()); if (currentSheet.containsKey(point)) { //avoid overwriting content //for now, add to extraTextCells //TODO: consider allowing multiple text pieces //per x,y to keep the text together extraTextCells.add(cell); } else { currentSheet.put(point, cell); } } else { // Cell outside the worksheets extraTextCells.add(cell); } } /** * Adds a text cell with the given text comment. The given text * is trimmed, and ignored if null or empty. * * @param record record that holds the text value * @param text text content, may be null * @throws SAXException */ private void addTextCell(Record record, String text) throws SAXException { if (text != null) { text = text.trim(); if (text.length() > 0) { addCell(record, new TextCell(text)); } } } private void newSheet() { currentSheetIndex++; currentSheet = new TreeMap<>(new PointComparator()); } /** * Process an excel sheet. * * @throws SAXException if an error occurs */ private void processSheet() throws SAXException { // Sheet Start handler.startElement("div", "class", "page"); if (currentSheetIndex < sheetNames.size()) { handler.element("h1", sheetNames.get(currentSheetIndex)); } handler.startElement("table"); handler.startElement("tbody"); // Process Rows int currentRow = 0; int currentColumn = 0; handler.startElement("tr"); handler.startElement("td"); for (Map.Entry entry : currentSheet.entrySet()) { if (currentRow != entry.getKey().y) { // We've moved onto a new row, possibly skipping some do { handler.endElement("td"); handler.endElement("tr"); handler.startElement("tr"); handler.startElement("td"); currentRow++; } while (officeParserConfig.isIncludeMissingRows() && currentRow < entry.getKey().y); currentRow = entry.getKey().y; currentColumn = 0; } while (currentColumn < entry.getKey().x) { handler.endElement("td"); handler.startElement("td"); currentColumn++; } entry.getValue().render(handler); } handler.endElement("td"); handler.endElement("tr"); // Sheet End handler.endElement("tbody"); handler.endElement("table"); // Finish up processExtraText(); handler.endElement("div"); } private void findPictures(List records) throws IOException, SAXException, TikaException { for (EscherRecord escherRecord : records) { if (escherRecord instanceof EscherBSERecord) { EscherBlipRecord blip = ((EscherBSERecord) escherRecord).getBlipRecord(); if (blip != null) { HSSFPictureData picture = new HSSFPictureData(blip); String mimeType = picture.getMimeType(); TikaInputStream stream = TikaInputStream.get(picture.getData()); // Handle the embeded resource extractor.handleEmbeddedResource(stream, null, null, mimeType, handler, true); } } // Recursive call. findPictures(escherRecord.getChildRecords()); } } private class TikaFormatTrackingHSSFListener extends FormatTrackingHSSFListener { public TikaFormatTrackingHSSFListener(HSSFListener childListener, Locale locale) { super(childListener, locale); } @Override public void processRecord(Record record) { // System.out.println(record.getClass() + " : "+record.toString()); super.processRecord(record); } @Override public String formatNumberDateCell(CellValueRecordInterface cell) { String formatString = this.getFormatString(cell); double value; if (cell instanceof NumberRecord) { value = ((NumberRecord) cell).getValue(); } else { if (!(cell instanceof FormulaRecord)) { throw new IllegalArgumentException( "Unsupported CellValue Record passed in " + cell); } value = ((FormulaRecord) cell).getValue(); } if (DateUtil.isADateFormat(getFormatIndex(cell), formatString)) { return tikaExcelDataFormatter .formatRawCellContents(value, getFormatIndex(cell), formatString, false); } else if ("general".equalsIgnoreCase(formatString)) { return tikaExcelDataFormatter .formatRawCellContents(value, getFormatIndex(cell), formatString, false); } return super.formatNumberDateCell(cell); } } } /** * Utility comparator for points. */ private static class PointComparator implements Comparator { public int compare(Point a, Point b) { int diff = a.y - b.y; if (diff == 0) { diff = a.x - b.x; } return diff; } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy