org.pentaho.di.trans.steps.excelinput.staxpoi.StaxPoiSheet Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of kettle-engine Show documentation
Show all versions of kettle-engine Show documentation
Container pom for Pentaho Data Integration modules
The newest version!
/*! ******************************************************************************
*
* Pentaho Data Integration
*
* Copyright (C) 2002-2018 by Hitachi Vantara : http://www.pentaho.com
*
*******************************************************************************
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
* Author = Shailesh Ahuja
*/
package org.pentaho.di.trans.steps.excelinput.staxpoi;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.TimeZone;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import com.google.common.annotations.VisibleForTesting;
import org.apache.commons.lang.StringUtils;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTXf;
import org.pentaho.di.core.spreadsheet.KCell;
import org.pentaho.di.core.spreadsheet.KCellType;
import org.pentaho.di.core.spreadsheet.KSheet;
/**
* Streaming reader for XLSX sheets.
* Rows should only be accessed sequentially: random access will severely impact performance.
*/
public class StaxPoiSheet implements KSheet {
// set to UTC for coherence with PoiSheet;
private static final TimeZone DATE_TZ = TimeZone.getTimeZone( "UTC" );
private final String sheetName;
private final String sheetId;
private final XSSFReader xssfReader;
private InputStream sheetStream;
private XMLStreamReader sheetReader;
// hold the pointer to the current row so that access to the next row in the stream is quick and easy
private int currentRow;
private List headerRow;
private int numRows;
private int numCols;
private boolean maxColsNumberDefined = true;
// 1-based first non-empty row
private int firstRow;
private KCell[] currentRowCells;
// full shared strings table
private SharedStringsTable sst;
// custom styles
private StylesTable styles;
public StaxPoiSheet( XSSFReader reader, String sheetName, String sheetID )
throws InvalidFormatException, IOException, XMLStreamException {
this.sheetName = sheetName;
xssfReader = reader;
sheetId = sheetID;
sst = reader.getSharedStringsTable();
styles = reader.getStylesTable();
sheetStream = reader.getSheet( sheetID );
XMLInputFactory factory = XMLInputFactory.newInstance();
sheetReader = factory.createXMLStreamReader( sheetStream );
headerRow = new ArrayList();
while ( sheetReader.hasNext() ) {
int event = sheetReader.next();
if ( event == XMLStreamConstants.START_ELEMENT && sheetReader.getLocalName().equals( "dimension" ) ) {
String dim = sheetReader.getAttributeValue( null, "ref" );
// empty sheets have dimension with no range
if ( StringUtils.contains( dim, ':' ) ) {
dim = dim.split( ":" )[1];
numRows = StaxUtil.extractRowNumber( dim );
numCols = StaxUtil.extractColumnNumber( dim );
} else {
maxColsNumberDefined = false;
numCols = StaxUtil.MAX_COLUMNS;
numRows = StaxUtil.MAX_ROWS;
}
}
if ( event == XMLStreamConstants.START_ELEMENT && sheetReader.getLocalName().equals( "row" ) ) {
currentRow = Integer.parseInt( sheetReader.getAttributeValue( null, "r" ) );
firstRow = currentRow;
// calculate the number of columns in the header row
while ( sheetReader.hasNext() ) {
event = sheetReader.next();
if ( event == XMLStreamConstants.END_ELEMENT && sheetReader.getLocalName().equals( "row" ) ) {
// if the row has ended, break the inner while loop
break;
}
if ( event == XMLStreamConstants.START_ELEMENT && sheetReader.getLocalName().equals( "c" ) ) {
String attributeValue = sheetReader.getAttributeValue( null, "t" );
if ( attributeValue != null ) {
if ( attributeValue.equals( "s" ) ) {
// if the type of the cell is string, we continue
while ( sheetReader.hasNext() ) {
event = sheetReader.next();
if ( event == XMLStreamConstants.START_ELEMENT && sheetReader.getLocalName().equals( "v" ) ) {
int idx = Integer.parseInt( sheetReader.getElementText() );
String content = new XSSFRichTextString( sst.getEntryAt( idx ) ).toString();
headerRow.add( content );
break;
}
}
} else if ( attributeValue.equals( "inlineStr" ) ) {
// if the type of the cell is string, we continue
while ( sheetReader.hasNext() ) {
event = sheetReader.next();
if ( event == XMLStreamConstants.START_ELEMENT && sheetReader.getLocalName().equals( "is" ) ) {
while ( sheetReader.hasNext() ) {
event = sheetReader.next();
if ( event == XMLStreamConstants.CHARACTERS ) {
String content = new XSSFRichTextString( sheetReader.getText() ).toString();
headerRow.add( content );
break;
}
}
break;
}
}
}
} else {
break;
}
}
}
// we have parsed the header row
break;
}
}
}
boolean isMaxColsNumberDefined() {
return maxColsNumberDefined;
}
@Override
public KCell[] getRow( int rownr ) {
// xlsx raw row numbers are 1-based index, KSheet is 0-based
if ( rownr < 0 || rownr >= numRows ) {
// KSheet requires out of bounds here
throw new ArrayIndexOutOfBoundsException( rownr );
}
if ( rownr + 1 < firstRow ) {
// before first non-empty row
return new KCell[0];
}
if ( rownr > 0 && currentRow == rownr + 1 ) {
if ( currentRowCells != null ) {
return currentRowCells;
}
// The case when the table contains the empty row(s) before the header
// but at the same time user wants to read starting from 0 row
return new KCell[0];
}
try {
if ( currentRow >= rownr + 1 ) {
// allow random access per api despite performance hit
resetSheetReader();
}
while ( sheetReader.hasNext() ) {
int event = sheetReader.next();
if ( event == XMLStreamConstants.START_ELEMENT && sheetReader.getLocalName().equals( "row" ) ) {
String rowIndicator = sheetReader.getAttributeValue( null, "r" );
currentRow = Integer.parseInt( rowIndicator );
if ( currentRow < rownr + 1 ) {
continue;
}
currentRowCells = parseRow();
return currentRowCells;
}
}
} catch ( Exception e ) {
throw new RuntimeException( e );
}
numRows = currentRow;
return new KCell[] {};
}
private KCell[] parseRow() throws XMLStreamException {
List cells;
if ( isMaxColsNumberDefined() ) {
cells = new ArrayList( numCols );
} else {
cells = new ArrayList();
}
int undefinedColIndex = 0;
for ( int i = 0; i < numCols; i++ ) {
// go to the "c" cell tag
while ( sheetReader.hasNext() ) {
int event = sheetReader.next();
if ( event == XMLStreamConstants.START_ELEMENT && sheetReader.getLocalName().equals( "c" ) ) {
break;
}
if ( event == XMLStreamConstants.END_ELEMENT && sheetReader.getLocalName().equals( "row" ) ) {
// premature end of row, returning what we have
return cells.toArray( new StaxPoiCell[cells.size()] );
}
}
String cellLocation = sheetReader.getAttributeValue( null, "r" );
int columnIndex = StaxUtil.extractColumnNumber( cellLocation ) - 1;
String cellType = sheetReader.getAttributeValue( null, "t" );
String cellStyle = sheetReader.getAttributeValue( null, "s" );
boolean isFormula = false;
String content = null;
// get value tag
while ( sheetReader.hasNext() ) {
int event = sheetReader.next();
if ( event == XMLStreamConstants.START_ELEMENT && sheetReader.getLocalName().equals( "v" ) ) {
// read content as string
if ( cellType != null && cellType.equals( "s" ) ) {
int idx = Integer.parseInt( sheetReader.getElementText() );
content = new XSSFRichTextString( sst.getEntryAt( idx ) ).toString();
} else {
content = sheetReader.getElementText();
}
}
if ( event == XMLStreamConstants.START_ELEMENT && sheetReader.getLocalName().equals( "is" ) ) {
while ( sheetReader.hasNext() ) {
event = sheetReader.next();
if ( event == XMLStreamConstants.CHARACTERS ) {
content = new XSSFRichTextString( sheetReader.getText() ).toString();
break;
}
}
}
if ( event == XMLStreamConstants.START_ELEMENT && sheetReader.getLocalName().equals( "f" ) ) {
isFormula = true;
}
if ( event == XMLStreamConstants.END_ELEMENT && sheetReader.getLocalName().equals( "c" ) ) {
break;
}
}
if ( content != null ) {
KCellType kcType = getCellType( cellType, cellStyle, isFormula );
setCells( cells, undefinedColIndex, columnIndex, new StaxPoiCell( parseValue( kcType, content ), kcType, currentRow ) );
} else {
// else let cell be null
setCells( cells, undefinedColIndex, columnIndex, null );
}
undefinedColIndex = columnIndex + 1;
}
return cells.toArray( new StaxPoiCell[cells.size()] );
}
private static void setCells( List cellsArray, int firstUndefinedColIndex, int foundColIndex, StaxPoiCell cell ) {
// fill all cells before found with Null
for ( int index = firstUndefinedColIndex; index < foundColIndex; index++ ) {
cellsArray.add( null );
}
// add the found Cell
cellsArray.add( cell );
}
@Override
public String getName() {
return sheetName;
}
@Override
public int getRows() {
return numRows;
}
@Override
public KCell getCell( int colnr, int rownr ) {
if ( rownr == 0 && colnr < headerRow.size() ) {
// only possible to return header
return new StaxPoiCell( headerRow.get( colnr ), rownr );
}
// if random access this will be very expensive
KCell[] row = getRow( rownr );
if ( row != null && rownr < row.length ) {
return row[colnr];
}
return null;
}
private KCellType getCellType( String cellType, String cellStyle, boolean isFormula ) {
// numeric type can be implicit or 'n'
if ( cellType == null || cellType.equals( "n" ) ) {
// the only difference between date and numeric is the cell format
if ( isDateCell( cellStyle ) ) {
return isFormula ? KCellType.DATE_FORMULA : KCellType.DATE;
}
return isFormula ? KCellType.NUMBER_FORMULA : KCellType.NUMBER;
}
switch ( cellType ) {
case "s":
return KCellType.LABEL;
case "b":
return isFormula ? KCellType.BOOLEAN_FORMULA : KCellType.BOOLEAN;
case "e":
// error
return KCellType.EMPTY;
case "str":
default:
return KCellType.STRING_FORMULA;
}
}
@VisibleForTesting
protected boolean isDateCell( String cellStyle ) {
if ( cellStyle != null ) {
int styleIdx = Integer.parseInt( cellStyle );
CTXf cellXf = styles.getCellXfAt( styleIdx );
if ( cellXf != null ) {
// need id for builtin types, format if custom
short formatId = (short) cellXf.getNumFmtId();
String format = styles.getNumberFormatAt( formatId );
return DateUtil.isADateFormat( formatId, format );
}
}
return false;
}
private Object parseValue( KCellType type, String vContent ) {
if ( vContent == null ) {
return null;
}
try {
switch ( type ) {
case NUMBER:
case NUMBER_FORMULA:
return Double.parseDouble( vContent );
case BOOLEAN:
case BOOLEAN_FORMULA:
return vContent.equals( "1" );
case DATE:
case DATE_FORMULA:
Double xlDate = Double.parseDouble( vContent );
return DateUtil.getJavaDate( xlDate, DATE_TZ );
case LABEL:
case STRING_FORMULA:
case EMPTY:
default:
return vContent;
}
} catch ( Exception e ) {
return vContent;
}
}
private void resetSheetReader() throws IOException, XMLStreamException, InvalidFormatException {
sheetReader.close();
sheetStream.close();
sheetStream = xssfReader.getSheet( sheetId );
XMLInputFactory factory = XMLInputFactory.newInstance();
sheetReader = factory.createXMLStreamReader( sheetStream );
}
public void close() throws IOException, XMLStreamException {
sheetReader.close();
sheetStream.close();
}
}