com.github.liaochong.myexcel.core.SaxExcelReader Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of myexcel Show documentation
Show all versions of myexcel Show documentation
MyExcel, a new way to operate Excel.
/*
* Copyright 2019 liaochong
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.github.liaochong.myexcel.core;
import com.github.liaochong.myexcel.core.cache.StringsCache;
import com.github.liaochong.myexcel.core.context.Hyperlink;
import com.github.liaochong.myexcel.core.context.ReadContext;
import com.github.liaochong.myexcel.core.context.RowContext;
import com.github.liaochong.myexcel.exception.ExcelReadException;
import com.github.liaochong.myexcel.exception.SaxReadException;
import com.github.liaochong.myexcel.exception.StopReadException;
import com.github.liaochong.myexcel.utils.ReflectUtil;
import com.github.liaochong.myexcel.utils.TempFileOperator;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.poifs.filesystem.FileMagic;
import org.apache.poi.util.XMLHelper;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.slf4j.Logger;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import javax.xml.parsers.ParserConfigurationException;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.function.BiConsumer;
import java.util.function.BiFunction;
import java.util.function.Consumer;
import java.util.function.Function;
import java.util.function.Predicate;
/**
* sax模式读取excel,支持xls、xlsx、csv格式读取
*
* @author liaochong
* @version 1.0
*/
public class SaxExcelReader {
private static final int DEFAULT_SHEET_INDEX = 0;
private static final Logger log = org.slf4j.LoggerFactory.getLogger(SaxExcelReader.class);
/**
* 元数据
*/
private WorkbookMetaData workbookMetaData;
private final List result = new LinkedList<>();
private final ReadConfig readConfig = new ReadConfig<>(DEFAULT_SHEET_INDEX);
private SaxExcelReader(Class dataType) {
this.readConfig.dataType = dataType;
}
public static SaxExcelReader of(Class clazz) {
return new SaxExcelReader<>(clazz);
}
public SaxExcelReader sheet(Integer sheetIndex) {
return sheets(sheetIndex);
}
public SaxExcelReader sheet(String sheetName) {
return sheets(sheetName);
}
public SaxExcelReader sheets(Integer... sheetIndexs) {
this.readConfig.sheetIndexs.clear();
this.readConfig.sheetIndexs.addAll(Arrays.asList(sheetIndexs));
return this;
}
public SaxExcelReader sheets(String... sheetNames) {
this.readConfig.sheetNames.clear();
this.readConfig.sheetNames.addAll(Arrays.asList(sheetNames));
return this;
}
public SaxExcelReader rowFilter(Predicate rowFilter) {
this.readConfig.rowFilter = rowFilter;
return this;
}
public SaxExcelReader beanFilter(Predicate beanFilter) {
this.readConfig.beanFilter = beanFilter;
return this;
}
@Deprecated
public SaxExcelReader charset(String charset) {
this.readConfig.csvCharset = charset;
return this;
}
public SaxExcelReader csvCharset(String charset) {
this.readConfig.csvCharset = charset;
return this;
}
public SaxExcelReader csvDelimiter(char delimiter) {
this.readConfig.csvDelimiter = delimiter;
return this;
}
public SaxExcelReader exceptionally(BiFunction exceptionFunction) {
this.readConfig.exceptionFunction = exceptionFunction;
return this;
}
public SaxExcelReader noTrim() {
this.readConfig.trim = v -> v;
return this;
}
public SaxExcelReader readAllSheet() {
this.readConfig.readAllSheet = true;
return this;
}
public SaxExcelReader ignoreBlankRow() {
this.readConfig.ignoreBlankRow = true;
return this;
}
public SaxExcelReader stopReadingOnBlankRow() {
this.readConfig.stopReadingOnBlankRow = true;
return this;
}
public SaxExcelReader startSheet(BiConsumer startSheetConsumer) {
this.readConfig.startSheetConsumer = startSheetConsumer;
return this;
}
public SaxExcelReader detectedMerge() {
this.readConfig.detectedMerge = true;
return this;
}
public List read(Path path) {
doRead(path.toFile());
return result;
}
public List read(InputStream fileInputStream) {
doRead(fileInputStream);
return result;
}
public List read(File file) {
doRead(file);
return result;
}
public void readThen(InputStream fileInputStream, Consumer consumer) {
this.readConfig.consumer = consumer;
doRead(fileInputStream);
}
public void readThen(File file, Consumer consumer) {
this.readConfig.consumer = consumer;
doRead(file);
}
public void readThen(Path path, Consumer consumer) {
this.readConfig.consumer = consumer;
doRead(path.toFile());
}
public void readThen(InputStream fileInputStream, BiConsumer contextConsumer) {
this.readConfig.contextConsumer = contextConsumer;
doRead(fileInputStream);
}
public void readThen(File file, BiConsumer contextConsumer) {
this.readConfig.contextConsumer = contextConsumer;
doRead(file);
}
public void readThen(Path path, BiConsumer contextConsumer) {
this.readConfig.contextConsumer = contextConsumer;
doRead(path.toFile());
}
public void readThen(File file, BiFunction contextFunction) {
this.readConfig.contextFunction = contextFunction;
doRead(file);
}
public void readThen(InputStream fileInputStream, BiFunction contextFunction) {
this.readConfig.contextFunction = contextFunction;
doRead(fileInputStream);
}
public void readThen(Path path, BiFunction contextFunction) {
this.readConfig.contextFunction = contextFunction;
doRead(path.toFile());
}
public void readThen(InputStream fileInputStream, Function function) {
this.readConfig.function = function;
doRead(fileInputStream);
}
public void readThen(File file, Function function) {
this.readConfig.function = function;
doRead(file);
}
public void readThen(Path path, Function function) {
this.readConfig.function = function;
doRead(path.toFile());
}
public static WorkbookMetaData getWorkbookMetaData(Path path) {
SaxExcelReader saxExcelReader = new SaxExcelReader<>(null);
saxExcelReader.doRead(path.toFile(), true);
return saxExcelReader.workbookMetaData;
}
public static WorkbookMetaData getWorkbookMetaData(InputStream fileInputStream) {
SaxExcelReader saxExcelReader = new SaxExcelReader<>(null);
saxExcelReader.doRead(fileInputStream, true);
return saxExcelReader.workbookMetaData;
}
public static WorkbookMetaData getWorkbookMetaData(File file) {
SaxExcelReader saxExcelReader = new SaxExcelReader<>(null);
saxExcelReader.doRead(file, true);
return saxExcelReader.workbookMetaData;
}
private void doRead(InputStream fileInputStream) {
this.doRead(fileInputStream, false);
}
private void doRead(InputStream fileInputStream, boolean readMetaData) {
Path path = TempFileOperator.convertToFile(fileInputStream);
try {
doRead(path.toFile(), readMetaData);
} finally {
TempFileOperator.deleteTempFile(path);
}
}
private void doRead(File file) {
this.doRead(file, false);
}
private void doRead(File file, boolean readMetaData) {
FileMagic fm;
try (InputStream is = FileMagic.prepareToCheckMagic(new FileInputStream(file))) {
fm = FileMagic.valueOf(is);
} catch (Throwable throwable) {
throw new SaxReadException("Fail to get excel magic", throwable);
}
try {
switch (fm) {
case OOXML:
doReadXlsx(file, readMetaData);
break;
case OLE2:
doReadXls(file, readMetaData);
break;
default:
if (readMetaData) {
throw new UnsupportedOperationException();
}
doReadCsv(file);
}
} catch (Throwable e) {
throw new SaxReadException("Fail to read excel", e);
}
}
private void doReadXls(File file, boolean readMetaData) {
try {
if (readMetaData) {
workbookMetaData = new WorkbookMetaData();
new HSSFMetaDataSaxReadHandler(file, workbookMetaData).process();
} else {
HSSFPreReadHandler.HSSFPreData hssfPreData = null;
if (readConfig.detectedMerge || readConfig.detectedHyperlink()) {
HSSFPreReadHandler hssfPreReadHandler = new HSSFPreReadHandler(file, readConfig);
hssfPreReadHandler.process();
hssfPreData = hssfPreReadHandler.getHssfPreData();
}
if (hssfPreData == null || hssfPreData.mergeCellIndexMapping == null) {
readConfig.detectedMerge = false;
}
new HSSFSaxReadHandler<>(file, result, readConfig, hssfPreData).process();
}
} catch (StopReadException e) {
// do nothing
} catch (IOException e) {
throw new SaxReadException("Fail to read xls file:" + file.getName(), e);
}
}
private void doReadXlsx(File file, boolean readMetaData) {
try (OPCPackage p = OPCPackage.open(file, PackageAccess.READ)) {
if (readMetaData) {
processMetaData(p);
} else {
process(p);
}
} catch (StopReadException e) {
// do nothing
} catch (Exception e) {
throw new SaxReadException("Fail to read xlsx file:" + file.getName(), e);
}
}
private void doReadCsv(File file) {
try {
new CsvReadHandler<>(Files.newInputStream(file.toPath()), readConfig, result).read();
} catch (StopReadException e) {
// do nothing
} catch (Throwable throwable) {
throw new ExcelReadException("Fail to read csv file:" + file.getName(), throwable);
}
}
/**
* Initiates the processing of the XLS workbook file to CSV.
*
* @throws IOException If reading the data from the package fails.
* @throws SAXException if parsing the XML data fails.
*/
private void process(OPCPackage xlsxPackage) throws IOException, OpenXML4JException, SAXException {
long startTime = System.currentTimeMillis();
Map preDataIndexMapping = this.processPreData(xlsxPackage);
StringsCache stringsCache = new StringsCache();
try {
ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(xlsxPackage, stringsCache);
this.doReadSheet(xlsxPackage, xssfReadContext -> {
readConfig.startSheetConsumer.accept(xssfReadContext.sheetName, xssfReadContext.sheetIndex);
XSSFSheetPreXMLHandler.XSSFPreData xssfPreData = preDataIndexMapping.get(xssfReadContext.sheetIndex);
ContentHandler handler = new XSSFSheetXMLHandler(
xssfPreData, strings, new XSSFSaxReadHandler<>(result, readConfig, xssfPreData));
processSheet(handler, xssfReadContext.inputStream);
preDataIndexMapping.remove(xssfReadContext.sheetIndex);
});
} finally {
stringsCache.clearAll();
}
log.info("Sax import takes {} ms", System.currentTimeMillis() - startTime);
}
private Map processPreData(OPCPackage xlsxPackage) throws IOException, OpenXML4JException {
if (!readConfig.detectedMerge && !readConfig.detectedHyperlink()) {
return Collections.emptyMap();
}
Map preDataIndexMapping = new HashMap<>();
this.doReadSheet(xlsxPackage, xssfReadContext -> {
xssfReadContext.packageRelationshipCollection = Optional.ofNullable(xssfReadContext.sheetIterator.getSheetPart())
.map(packagePart -> {
try {
return packagePart.getRelationships();
} catch (InvalidFormatException e) {
throw new SaxReadException("Get relationships failure.", e);
}
}).orElse(null);
XSSFSheetPreXMLHandler xssfSheetPreXMLHandler = new XSSFSheetPreXMLHandler(readConfig, xssfReadContext);
processSheet(xssfSheetPreXMLHandler, xssfReadContext.inputStream);
preDataIndexMapping.put(xssfReadContext.sheetIndex, xssfSheetPreXMLHandler.getXssfPreData());
});
if (preDataIndexMapping.isEmpty()) {
readConfig.detectedMerge = false;
}
return preDataIndexMapping;
}
private void processMetaData(OPCPackage xlsxPackage) throws IOException, OpenXML4JException {
workbookMetaData = new WorkbookMetaData();
readConfig.readAllSheet = true;
int lastIndex = this.doReadSheet(xlsxPackage, xssfReadContext -> {
SheetMetaData sheetMetaData = new SheetMetaData(xssfReadContext.sheetName, xssfReadContext.sheetIndex);
this.processSheet(new XSSFSheetMetaDataXMLHandler(sheetMetaData), xssfReadContext.inputStream);
// 设置元数据信息
workbookMetaData.getSheetMetaDataList().add(sheetMetaData);
});
if (lastIndex > -1) {
workbookMetaData.setSheetCount(lastIndex + 1);
}
}
private int doReadSheet(OPCPackage xlsxPackage, Consumer consumer) throws IOException, OpenXML4JException {
XSSFReader.SheetIterator iter = this.getSheetIterator(xlsxPackage);
Function acceptFunction = this.getSheetAcceptFunction();
int index = -1;
while (iter.hasNext()) {
++index;
try (InputStream stream = iter.next()) {
XSSFReadContext xssfReadContext = new XSSFReadContext(iter, stream, index, iter.getSheetName());
if (acceptFunction.apply(xssfReadContext)) {
consumer.accept(xssfReadContext);
}
}
}
return index;
}
private XSSFReader.SheetIterator getSheetIterator(OPCPackage xlsxPackage) throws IOException, OpenXML4JException {
XSSFReader xssfReader = new XSSFReader(xlsxPackage);
return (XSSFReader.SheetIterator) xssfReader.getSheetsData();
}
private Function getSheetAcceptFunction() {
Function acceptFunction = xssfReadContext -> true;
if (readConfig.readAllSheet) {
acceptFunction = xssfReadContext -> true;
} else if (!readConfig.sheetNames.isEmpty()) {
acceptFunction = xssfReadContext -> readConfig.sheetNames.contains(xssfReadContext.sheetName);
} else if (!readConfig.sheetIndexs.isEmpty()) {
acceptFunction = xssfReadContext -> readConfig.sheetIndexs.contains(xssfReadContext.sheetIndex);
}
return acceptFunction;
}
/**
* Parses and shows the content of one sheet
* using the specified styles and shared-strings tables.
*
* @param sheetInputStream The stream to read the sheet-data from.
*/
private void processSheet(
ContentHandler handler,
InputStream sheetInputStream) {
try {
XMLReader sheetParser = XMLHelper.newXMLReader();
sheetParser.setContentHandler(handler);
sheetParser.parse(new InputSource(sheetInputStream));
} catch (ParserConfigurationException | IOException | SAXException e) {
throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage());
}
}
public static final class ReadConfig {
public Class dataType;
public Set sheetNames = new HashSet<>();
public Set sheetIndexs = new HashSet<>();
public Consumer consumer;
public BiConsumer contextConsumer;
public Function function;
public BiFunction contextFunction;
public Predicate rowFilter = row -> true;
public Predicate beanFilter = bean -> true;
public BiFunction exceptionFunction = (t, c) -> false;
public String csvCharset = "UTF-8";
public char csvDelimiter = ',';
public Function trim = v -> {
if (v == null) {
return v;
}
return v.trim();
};
public boolean readAllSheet;
/**
* 是否忽略空白行,默认为否
*/
public boolean ignoreBlankRow = false;
/**
* 是否在遇到空白行时停止读取
*/
public boolean stopReadingOnBlankRow = false;
public boolean detectedMerge;
public BiConsumer startSheetConsumer = (sheetName, sheetIndex) -> {
log.info("Start read excel, sheet:{},index:{}", sheetName, sheetIndex);
};
public ReadConfig(int sheetIndex) {
sheetIndexs.add(sheetIndex);
}
public boolean detectedHyperlink() {
if (dataType == Map.class) {
return false;
}
return ReflectUtil.getFieldDefinitionMapOfExcelColumn(dataType).values().stream().anyMatch(f -> f.getField().getType() == Hyperlink.class);
}
}
public static class XSSFReadContext {
public XSSFReader.SheetIterator sheetIterator;
public InputStream inputStream;
public Integer sheetIndex;
public String sheetName;
public PackageRelationshipCollection packageRelationshipCollection;
public XSSFReadContext(XSSFReader.SheetIterator sheetIterator, InputStream inputStream, Integer sheetIndex, String sheetName) {
this.sheetIterator = sheetIterator;
this.inputStream = inputStream;
this.sheetIndex = sheetIndex;
this.sheetName = sheetName;
}
}
}