com.mdazad.chunkysax.ExcelChunkySAX Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of excel-chunky-sax Show documentation
Show all versions of excel-chunky-sax Show documentation
The ExcelChunkySAX library provides a way to process large Excel files in chunks using a SAX parser.
It reads the Excel file in chunks and performs the specified action on each chunk.
The library provides an interface ChunkAction that needs to be implemented to define the action to be performed on each chunk.
The library uses Apache POI library to read the Excel file and SAX parser to parse the XML data.
Note: any type of date format should be converted to yyyy-MM-dd.
The newest version!
package com.mdazad.chunkysax;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutionException;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.ss.usermodel.BuiltinFormats;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.util.IOUtils;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFCellStyle;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
/**
* The ExcelStreamer class provides a way to process large Excel files in chunks
* using SAX parser.
* It reads the Excel file in chunks and performs the specified action on each
* chunk.
* The class provides an interface ChunkAction that needs to be implemented to
* define the action to be performed on each chunk.
* The class uses Apache POI library to read the Excel file and SAX parser to
* parse the XML data.
*
* Note: any type of date format should be converted to yyyy-MM-dd
*/
public class ExcelChunkySAX {
public interface ChunkAction {
void performActionsForChunk(List> chunkData, Boolean isLast);
}
/**
* Processes an Excel file in chunks using the provided chunk size and action.
*
* @param inputStream The input stream of the Excel file to be processed.
* @param chunkSize The size of each chunk to be processed.
* @param action The action to be performed on each chunk.
* @throws Exception If an error occurs while processing the Excel file.
*/
public void processExcelFileInChunks(
InputStream inputStream, int chunkSize, ChunkAction action) throws Exception {
// check if file is empty
if (inputStream.available() == 0) {
action.performActionsForChunk(new ArrayList<>(), true);
return;
}
IOUtils.setByteArrayMaxOverride(1024 * 1024 * 300); // 300 MB
OPCPackage pkg = OPCPackage.open(inputStream);
XSSFReader reader = new XSSFReader(pkg);
SharedStringsTable sharedStringsTable = (SharedStringsTable) reader.getSharedStringsTable();
StylesTable stylesTable = reader.getStylesTable();
XMLReader parser = fetchSheetParser(sharedStringsTable, stylesTable);
Iterator sheets = reader.getSheetsData();
while (sheets.hasNext()) {
InputStream sheet = sheets.next();
InputSource sheetSource = new InputSource(sheet);
SheetHandler handler = new SheetHandler(sharedStringsTable, stylesTable, chunkSize, action);
parser.setContentHandler(handler);
parser.parse(sheetSource);
sheet.close();
}
}
protected XMLReader fetchSheetParser(SharedStringsTable sharedStringsTable, StylesTable stylesTable)
throws SAXException, ParserConfigurationException {
SAXParserFactory factory = SAXParserFactory.newInstance();
SAXParser saxParser = factory.newSAXParser();
XMLReader parser = saxParser.getXMLReader();
ContentHandler handler = new SheetHandler(sharedStringsTable, stylesTable);
parser.setContentHandler(handler);
return parser;
}
private class SheetHandler extends DefaultHandler {
private SharedStringsTable sharedStringsTable;
private StylesTable stylesTable;
private String lastContents;
private boolean nextIsString;
private boolean nextIsStyledNumeric;
private boolean inlineStr;
private int styleIndex;
private DataFormatter formatter;
protected Map header = new LinkedHashMap<>();
protected Map rowValues = new LinkedHashMap<>();
private Map row = new LinkedHashMap<>();
private int chunkSize;
private ChunkAction action;
private List
© 2015 - 2025 Weber Informatics LLC | Privacy Policy