com.monitorjbl.xlsx.impl.StreamingSheetReader Maven / Gradle / Ivy
package com.monitorjbl.xlsx.impl;
import com.monitorjbl.xlsx.exceptions.CloseException;
import com.monitorjbl.xlsx.exceptions.ParseException;
import org.apache.poi.ss.usermodel.BuiltinFormats;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.util.CellReference;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFCellStyle;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
import javax.xml.namespace.QName;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.Attribute;
import javax.xml.stream.events.Characters;
import javax.xml.stream.events.EndElement;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
public class StreamingSheetReader implements Iterable {
private static final Logger log = LoggerFactory.getLogger(StreamingSheetReader.class);
private final SharedStringsTable sst;
private final StylesTable stylesTable;
private final XMLEventReader parser;
private final DataFormatter dataFormatter = new DataFormatter();
private final Set hiddenColumns = new HashSet<>();
private int lastRowNum;
private int currentRowNum;
private int firstColNum = 0;
private int currentColNum;
private int rowCacheSize;
private List rowCache = new ArrayList<>();
private Iterator rowCacheIterator;
private String lastContents;
private StreamingRow currentRow;
private StreamingCell currentCell;
private boolean use1904Dates;
public StreamingSheetReader(SharedStringsTable sst, StylesTable stylesTable, XMLEventReader parser, final boolean use1904Dates, int rowCacheSize) {
this.sst = sst;
this.stylesTable = stylesTable;
this.parser = parser;
this.use1904Dates = use1904Dates;
this.rowCacheSize = rowCacheSize;
}
/**
* Read through a number of rows equal to the rowCacheSize field or until there is no more data to read
*
* @return true if data was read
*/
private boolean getRow() {
try {
rowCache.clear();
while(rowCache.size() < rowCacheSize && parser.hasNext()) {
handleEvent(parser.nextEvent());
}
rowCacheIterator = rowCache.iterator();
return rowCacheIterator.hasNext();
} catch(XMLStreamException | SAXException e) {
throw new ParseException("Error reading XML stream", e);
}
}
private String[] splitCellRef(String ref) {
int splitPos = -1;
// start at pos 1, since the first char is expected to always be a letter
for(int i=1;i= '0' && c <= '9') {
splitPos = i;
break;
}
}
return new String[] {
ref.substring(0, splitPos),
ref.substring(splitPos)
};
}
/**
* Handles a SAX event.
*
* @param event
* @throws SAXException
*/
private void handleEvent(XMLEvent event) throws SAXException {
if(event.getEventType() == XMLStreamConstants.CHARACTERS) {
Characters c = event.asCharacters();
lastContents += c.getData();
} else if(event.getEventType() == XMLStreamConstants.START_ELEMENT
&& isSpreadsheetTag(event.asStartElement().getName())) {
StartElement startElement = event.asStartElement();
String tagLocalName = startElement.getName().getLocalPart();
if("row".equals(tagLocalName)) {
Attribute rowNumAttr = startElement.getAttributeByName(new QName("r"));
int rowIndex = currentRowNum;
if(rowNumAttr != null) {
rowIndex = Integer.parseInt(rowNumAttr.getValue()) - 1;
currentRowNum = rowIndex;
}
Attribute isHiddenAttr = startElement.getAttributeByName(new QName("hidden"));
boolean isHidden = isHiddenAttr != null && ("1".equals(isHiddenAttr.getValue()) || "true".equals(isHiddenAttr.getValue()));
currentRow = new StreamingRow(rowIndex, isHidden);
currentColNum = firstColNum;
} else if("col".equals(tagLocalName)) {
Attribute isHiddenAttr = startElement.getAttributeByName(new QName("hidden"));
boolean isHidden = isHiddenAttr != null && ("1".equals(isHiddenAttr.getValue()) || "true".equals(isHiddenAttr.getValue()));
if(isHidden) {
Attribute minAttr = startElement.getAttributeByName(new QName("min"));
Attribute maxAttr = startElement.getAttributeByName(new QName("max"));
int min = Integer.parseInt(minAttr.getValue()) - 1;
int max = Integer.parseInt(maxAttr.getValue()) - 1;
for(int columnIndex = min; columnIndex <= max; columnIndex++)
hiddenColumns.add(columnIndex);
}
} else if("c".equals(tagLocalName)) {
Attribute ref = startElement.getAttributeByName(new QName("r"));
if(ref != null) {
String[] coord = splitCellRef(ref.getValue());
currentCell = new StreamingCell(CellReference.convertColStringToIndex(coord[0]), Integer.parseInt(coord[1]) - 1, use1904Dates);
} else {
currentCell = new StreamingCell(currentColNum, currentRowNum, use1904Dates);
}
setFormatString(startElement, currentCell);
Attribute type = startElement.getAttributeByName(new QName("t"));
if(type != null) {
currentCell.setType(type.getValue());
} else {
currentCell.setType("n");
}
Attribute style = startElement.getAttributeByName(new QName("s"));
if(style != null) {
String indexStr = style.getValue();
try {
int index = Integer.parseInt(indexStr);
currentCell.setCellStyle(stylesTable.getStyleAt(index));
} catch(NumberFormatException nfe) {
log.warn("Ignoring invalid style index {}", indexStr);
}
} else {
currentCell.setCellStyle(stylesTable.getStyleAt(0));
}
} else if("dimension".equals(tagLocalName)) {
Attribute refAttr = startElement.getAttributeByName(new QName("ref"));
String ref = refAttr != null ? refAttr.getValue() : null;
if(ref != null) {
// ref is formatted as A1 or A1:F25. Take the last numbers of this string and use it as lastRowNum
for(int i = ref.length() - 1; i >= 0; i--) {
if(!Character.isDigit(ref.charAt(i))) {
try {
lastRowNum = Integer.parseInt(ref.substring(i + 1)) - 1;
} catch(NumberFormatException ignore) { }
break;
}
}
for(int i = 0; i < ref.length(); i++) {
if(!Character.isAlphabetic(ref.charAt(i))) {
firstColNum = CellReference.convertColStringToIndex(ref.substring(0, i));
break;
}
}
}
} else if("f".equals(tagLocalName)) {
if (currentCell != null) {
currentCell.setFormulaType(true);
}
}
// Clear contents cache
lastContents = "";
} else if(event.getEventType() == XMLStreamConstants.END_ELEMENT
&& isSpreadsheetTag(event.asEndElement().getName())) {
EndElement endElement = event.asEndElement();
String tagLocalName = endElement.getName().getLocalPart();
if("v".equals(tagLocalName) || "t".equals(tagLocalName)) {
currentCell.setRawContents(unformattedContents());
currentCell.setContentSupplier(formattedContents());
} else if("row".equals(tagLocalName) && currentRow != null) {
rowCache.add(currentRow);
currentRowNum++;
} else if("c".equals(tagLocalName)) {
currentRow.getCellMap().put(currentCell.getColumnIndex(), currentCell);
currentCell = null;
currentColNum++;
} else if("f".equals(tagLocalName)) {
if (currentCell != null) {
currentCell.setFormula(lastContents);
}
}
}
}
/**
* Returns true if a tag is part of the main namespace for SpreadsheetML:
*
* - http://schemas.openxmlformats.org/spreadsheetml/2006/main
*
- http://purl.oclc.org/ooxml/spreadsheetml/main
*
* As opposed to http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing, etc.
*
* @param name
* @return
*/
private boolean isSpreadsheetTag(QName name) {
return (name.getNamespaceURI() != null
&& name.getNamespaceURI().endsWith("/main"));
}
/**
* Get the hidden state for a given column
*
* @param columnIndex - the column to set (0-based)
* @return hidden - false
if the column is visible
*/
boolean isColumnHidden(int columnIndex) {
if(rowCacheIterator == null) {
getRow();
}
return hiddenColumns.contains(columnIndex);
}
/**
* Gets the last row on the sheet
*
* @return
*/
int getLastRowNum() {
if(rowCacheIterator == null) {
getRow();
}
return lastRowNum;
}
/**
* Read the numeric format string out of the styles table for this cell. Stores
* the result in the Cell.
*
* @param startElement
* @param cell
*/
void setFormatString(StartElement startElement, StreamingCell cell) {
Attribute cellStyle = startElement.getAttributeByName(new QName("s"));
String cellStyleString = (cellStyle != null) ? cellStyle.getValue() : null;
XSSFCellStyle style = null;
if(cellStyleString != null) {
style = stylesTable.getStyleAt(Integer.parseInt(cellStyleString));
} else if(stylesTable.getNumCellStyles() > 0) {
style = stylesTable.getStyleAt(0);
}
if(style != null) {
cell.setNumericFormatIndex(style.getDataFormat());
String formatString = style.getDataFormatString();
if(formatString != null) {
cell.setNumericFormat(formatString);
} else {
cell.setNumericFormat(BuiltinFormats.getBuiltinFormat(cell.getNumericFormatIndex()));
}
} else {
cell.setNumericFormatIndex(null);
cell.setNumericFormat(null);
}
}
/**
* Tries to format the contents of the last contents appropriately based on
* the type of cell and the discovered numeric format.
*
* @return
*/
Supplier formattedContents() {
return getFormatterForType(currentCell.getType());
}
/**
* Tries to format the contents of the last contents appropriately based on
* the provided type and the discovered numeric format.
*
* @return
*/
private Supplier getFormatterForType(String type) {
switch(type) {
case "s": //string stored in shared table
if (!lastContents.isEmpty()) {
int idx = Integer.parseInt(lastContents);
return new StringSupplier(new XSSFRichTextString(sst.getEntryAt(idx)).toString());
}
return new StringSupplier(lastContents);
case "inlineStr": //inline string (not in sst)
case "str":
return new StringSupplier(new XSSFRichTextString(lastContents).toString());
case "e": //error type
return new StringSupplier("ERROR: " + lastContents);
case "n": //numeric type
if(currentCell.getNumericFormat() != null && lastContents.length() > 0) {
// the formatRawCellContents operation incurs a significant overhead on large sheets,
// and we want to defer the execution of this method until the value is actually needed.
// it is not needed in all cases..
final String currentLastContents = lastContents;
final int currentNumericFormatIndex = currentCell.getNumericFormatIndex();
final String currentNumericFormat = currentCell.getNumericFormat();
return new Supplier() {
String cachedContent;
@Override
public Object getContent() {
if (cachedContent == null) {
cachedContent = dataFormatter.formatRawCellContents(
Double.parseDouble(currentLastContents),
currentNumericFormatIndex,
currentNumericFormat);
}
return cachedContent;
}
};
} else {
return new StringSupplier(lastContents);
}
default:
return new StringSupplier(lastContents);
}
}
/**
* Returns the contents of the cell, with no formatting applied
*
* @return
*/
String unformattedContents() {
switch(currentCell.getType()) {
case "s": //string stored in shared table
if (!lastContents.isEmpty()) {
int idx = Integer.parseInt(lastContents);
return new XSSFRichTextString(sst.getEntryAt(idx)).toString();
}
return lastContents;
case "inlineStr": //inline string (not in sst)
return new XSSFRichTextString(lastContents).toString();
default:
return lastContents;
}
}
/**
* Returns a new streaming iterator to loop through rows. This iterator is not
* guaranteed to have all rows in memory, and any particular iteration may
* trigger a load from disk to read in new data.
*
* @return the streaming iterator
*/
@Override
public Iterator iterator() {
return new StreamingRowIterator();
}
public void close() {
try {
parser.close();
} catch(XMLStreamException e) {
throw new CloseException(e);
}
}
class StreamingRowIterator implements Iterator {
public StreamingRowIterator() {
if(rowCacheIterator == null) {
hasNext();
}
}
@Override
public boolean hasNext() {
return (rowCacheIterator != null && rowCacheIterator.hasNext()) || getRow();
}
@Override
public Row next() {
return rowCacheIterator.next();
}
@Override
public void remove() {
throw new RuntimeException("NotSupported");
}
}
}