All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.github.pjfanning.xlsx.StreamingReader Maven / Gradle / Ivy

The newest version!
package com.github.pjfanning.xlsx;

import com.github.pjfanning.xlsx.exceptions.OpenException;
import com.github.pjfanning.xlsx.exceptions.ParseException;
import com.github.pjfanning.xlsx.exceptions.ReadException;
import com.github.pjfanning.xlsx.impl.*;
import org.apache.poi.ss.usermodel.Workbook;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;

/**
 * Streaming Excel workbook implementation. Most advanced features of POI are not supported.
 * Use this only if your application can handle iterating through an entire workbook, row by
 * row.
 */
public class StreamingReader implements AutoCloseable {
  private final StreamingWorkbookReader workbookReader;

  public StreamingReader(StreamingWorkbookReader workbookReader) {
    this.workbookReader = workbookReader;
  }

  /**
   * Closes the streaming resource, attempting to clean up any temporary files created.
   *
   * @throws com.github.pjfanning.xlsx.exceptions.CloseException if there is an issue closing the stream
   */
  @Override
  public void close() throws IOException {
    workbookReader.close();
  }

  public static Builder builder() {
    return new Builder();
  }

  public static class Builder {
    private int rowCacheSize = 10;
    private int bufferSize = 1024;
    private boolean avoidTempFiles = false;
    private SharedStringsImplementationType sharedStringsImplementationType = SharedStringsImplementationType.POI_READ_ONLY;
    private boolean encryptSstTempFile = false;
    private CommentsImplementationType commentsImplementationType = CommentsImplementationType.POI_DEFAULT;
    private boolean encryptCommentsTempFile = false;
    private boolean adjustLegacyComments = false;
    private boolean readComments = false;
    private boolean readCoreProperties = false;
    private boolean readHyperlinks = false;
    private boolean readShapes = false;
    private boolean readStyles = true;
    private boolean readSharedFormulas = false;
    private boolean fullFormatRichText = false;
    private String password;

    /**
     * Gets the number of rows to keep in memory at any given point.
     * 

* Defaults to 10. *

* * @return number of rows to keep in memory at any given point */ public int getRowCacheSize() { return rowCacheSize; } /** * Gets the number of bytes to read into memory from the input * resource. *

* Defaults to 1024. *

* * @return the number of bytes to read into memory from the input resource */ public int getBufferSize() { return bufferSize; } /** * @return The password to use to unlock this workbook */ public String getPassword() { return password; } /** * @return Whether to avoid temp files when reading input streams. */ public boolean avoidTempFiles() { return avoidTempFiles; } /** * @return the type of shared string table implementation (default is POI_READ_ONLY). * @see #setSharedStringsImplementationType(SharedStringsImplementationType) * @since v3.5.0 */ public SharedStringsImplementationType getSharedStringsImplementationType() { return sharedStringsImplementationType; } /** * @return the type of comments table implementation (default is POI_DEFAULT). * @see #setCommentsImplementationType(CommentsImplementationType) * @since v3.5.0 */ public CommentsImplementationType getCommentsImplementationType() { return commentsImplementationType; } /** * @return Whether to use a temp file for the Shared Strings data. If false, no * temp file will be used and the entire table will be loaded into memory. * @deprecated use #getSharedStringsImplementationType() */ @Deprecated public boolean useSstTempFile() { return getSharedStringsImplementationType() == SharedStringsImplementationType.TEMP_FILE_BACKED; } /** * @return Whether to use {@link org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable} instead * of {@link org.apache.poi.xssf.model.SharedStringsTable}. If you use {@link #setUseSstTempFile(boolean)} * and set to `true`, then this setting is ignored. * * @see #useSstTempFile() * @since v3.3.0 * @deprecated use #getSharedStringsImplementationType() */ @Deprecated public boolean useSstReadOnly() { return getSharedStringsImplementationType() == SharedStringsImplementationType.POI_READ_ONLY; } /** * @return Whether to encrypt the temp file for the Shared Strings data. Only applies if useSstTempFile() * is true. */ public boolean encryptSstTempFile() { return encryptSstTempFile; } /** * @return Whether to adjust comments to remove boilerplate text (related to threaded comments). * See https://github.com/pjfanning/excel-streaming-reader/issues/57 */ public boolean adjustLegacyComments() { return adjustLegacyComments; } /** * @return Whether to read the comments associated with sheets and cells. */ public boolean readComments() { return readComments; } /** * @return Whether to use a temp file for the Comments data. If false, no * temp file will be used and the entire table will be loaded into memory. * @deprecated use {@link #getCommentsImplementationType()} */ @Deprecated public boolean useCommentsTempFile() { return getCommentsImplementationType() == CommentsImplementationType.TEMP_FILE_BACKED; } /** * @return Whether to encrypt the temp file for the Comments data. Only applies if useCommentsTempFile() * is true. */ public boolean encryptCommentsTempFile() { return encryptCommentsTempFile; } /** * @return Whether to read the core document properties. */ public boolean readCoreProperties() { return readCoreProperties; } /** * @return Whether to read the hyperlink data that appear in sheets. */ public boolean readHyperlinks() { return readHyperlinks; } /** * @return Whether to read the shapes (associated with pictures that appear in sheets). */ public boolean readShapes() { return readShapes; } /** * @return Whether to read the shared formulas. Only affects cell formulas, cell values are retrieved * using values stored in the sheet data. */ public boolean readSharedFormulas() { return readSharedFormulas; } /** * @return Whether to read the styles (associated with cells). Defaults to true. * @since v3.3.0 */ public boolean readStyles() { return readStyles; } /** * Whether to parse the full formatting data for rich text shared strings and comments. * This only has an effect if temp file SST and/or Comments Table support is enabled. The default is false. * When you don't use temp file support, full formatting data is returned for the rich text anyway. * @return Whether to parse the full formatting data for rich text shared strings and comments. * @see #useSstTempFile() * @see #useCommentsTempFile() */ public boolean fullFormatRichText() { return fullFormatRichText; } /** * The number of rows to keep in memory at any given point. *

* Defaults to 10. *

* * @param rowCacheSize number of rows * @return reference to current {@code Builder} */ public Builder rowCacheSize(int rowCacheSize) { this.rowCacheSize = rowCacheSize; return this; } /** * The number of bytes to read into memory from the input * resource. *

* Defaults to 1024. *

* * @param bufferSize buffer size in bytes * @return reference to current {@code Builder} */ public Builder bufferSize(int bufferSize) { this.bufferSize = bufferSize; return this; } /** * For password protected files, specify password to open file. * If the password is incorrect a {@code ReadException} is thrown on * {@code read}. *

NULL indicates that no password should be used, this is the * default value.

* * @param password to use when opening file * @return reference to current {@code Builder} */ public Builder password(String password) { this.password = password; return this; } /** * Enables a mode where the code tries to avoid creating temp files. This is independent of * {@code #setUseSstTempFile} and {@code #setUseCommentsTempFile}. *

* By default, temp files are used to avoid holding onto too much data in memory. * * @param avoidTempFiles whether to avoid using temp files when reading from input streams * @return reference to current {@code Builder} */ public Builder setAvoidTempFiles(boolean avoidTempFiles) { this.avoidTempFiles = avoidTempFiles; return this; } /** * Set the type of Shared Strings Table to use. The default is POI_READ_ONLY. *

* If you enable this feature, you may also want to enable fullFormatRichText. *

* poi-shared-strings is an optional dependency since v5.0.0 - you will need to include it in your project * if you want to use TEMP_FILE_BACKED or CUSTOM_MAP_BACKED implementations. *

* * @param sharedStringsImplementationType type of Shared Strings Table to use (must not be null) * @return reference to current {@code Builder} * @throws NullPointerException if null is passed as a param * @see #getSharedStringsImplementationType() * @since v3.5.0 */ public Builder setSharedStringsImplementationType(SharedStringsImplementationType sharedStringsImplementationType) { if (sharedStringsImplementationType == null) { throw new NullPointerException("sharedStringsImplementationType must not be null"); } this.sharedStringsImplementationType = sharedStringsImplementationType; return this; } /** * Set the type of Comments Table to use. The default is POI_DEFAULT. *

* If you enable this feature, you may also want to enable fullFormatRichText. *

* poi-shared-strings is an optional dependency since v5.0.0 - you will need to include it in your project * if you want to use TEMP_FILE_BACKED or CUSTOM_MAP_BACKED implementations. *

* * @param commentsImplementationType type of Comments Table to use (must not be null) * @return reference to current {@code Builder} * @throws NullPointerException if null is passed as a param * @see #getCommentsImplementationType() * @since v3.5.0 */ public Builder setCommentsImplementationType(CommentsImplementationType commentsImplementationType) { if (commentsImplementationType == null) { throw new NullPointerException("commentsImplementationType must not be null"); } this.commentsImplementationType = commentsImplementationType; return this; } /** * Enables use of Shared Strings Table temp file. This option exists to accommodate * extremely large workbooks with millions of unique strings. Normally, the SST is entirely * loaded into memory, but with large workbooks with high cardinality (i.e., very few * duplicate values) the SST may not fit entirely into memory. *

* By default, the entire SST *will* be loaded into memory. However, * enabling this option at all will have some noticeable performance degradation as you are * trading memory for disk space. *

* If you enable this feature, you may also want to enable fullFormatRichText. * * @param useSstTempFile whether to use a temp file to store the Shared Strings Table data * @return reference to current {@code Builder} * @see #setEncryptSstTempFile(boolean) * @see #setFullFormatRichText(boolean) * @see #setUseSstReadOnly(boolean) * @deprecated use {@link #setSharedStringsImplementationType(SharedStringsImplementationType)} */ @Deprecated public Builder setUseSstTempFile(boolean useSstTempFile) { if (useSstTempFile) { return setSharedStringsImplementationType(SharedStringsImplementationType.TEMP_FILE_BACKED); } else { return setSharedStringsImplementationType(SharedStringsImplementationType.POI_READ_ONLY); } } /** * If you use an in memory Shared String Table (default), this controls which in memory implementation to use. * {@link org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable} is a simpler implementation than * the default {@link org.apache.poi.xssf.model.SharedStringsTable} and uses less memory - but may not support formatted * text as well. * * @param useSstReadOnly Whether to use {@link org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable} instead * of {@link org.apache.poi.xssf.model.SharedStringsTable}. * @return reference to current {@code Builder} * @see #setUseSstTempFile(boolean) * @since v3.3.0 * @deprecated use {@link #setSharedStringsImplementationType(SharedStringsImplementationType)} */ @Deprecated public Builder setUseSstReadOnly(boolean useSstReadOnly) { if (useSstReadOnly) { return setSharedStringsImplementationType(SharedStringsImplementationType.POI_READ_ONLY); } else { return setSharedStringsImplementationType(SharedStringsImplementationType.POI_DEFAULT); } } /** * Enables use of encryption in Shared Strings Table temp file. This only applies if setUseSstTempFile * is set to true. *

* By default, the temp file is not encrypted. However, * enabling this option could slow down the processing of Shared Strings data. * * @param encryptSstTempFile whether to encrypt the temp file used to store the Shared Strings Table data * @return reference to current {@code Builder} * @see #setUseSstTempFile(boolean) */ public Builder setEncryptSstTempFile(boolean encryptSstTempFile) { this.encryptSstTempFile = encryptSstTempFile; return this; } /** * Enables use of Comments temp file. This option exists to accommodate * workbooks with lots of comments. Normally, the Comments are all * loaded into memory. *

* By default, all the Comments data *will* be loaded into memory. However, * enabling this option at all will have some noticeable performance degradation as you are * trading memory for disk space. *

* If you enable this feature, you also want to enable fullFormatRichText. * * @param useCommentsTempFile whether to use a temp file to store the Comments data * @return reference to current {@code Builder} * @see #setReadComments(boolean) * @see #setEncryptCommentsTempFile(boolean) * @see #setFullFormatRichText(boolean) * @deprecated use {@link #setCommentsImplementationType(CommentsImplementationType)} */ @Deprecated public Builder setUseCommentsTempFile(boolean useCommentsTempFile) { if (useCommentsTempFile) { return setCommentsImplementationType(CommentsImplementationType.TEMP_FILE_BACKED); } else { return setCommentsImplementationType(CommentsImplementationType.POI_DEFAULT); } } /** * Enables use of encryption in the Comments temp file. This only applies if setUseCommentsTempFile * is set to true. *

* By default, the temp file is not encrypted. However, * enabling this option could slow down the processing of Comments data. * * @param encryptCommentsTempFile whether to encrypt the temp file used to store the Comments data * @return reference to current {@code Builder} * @see #setReadComments(boolean) * @see #setUseCommentsTempFile(boolean) */ public Builder setEncryptCommentsTempFile(boolean encryptCommentsTempFile) { this.encryptCommentsTempFile = encryptCommentsTempFile; return this; } /** * Enables the reading of the comments. * * @param readComments whether to read the comments associated with sheets and cells * @return reference to current {@code Builder} */ public Builder setReadComments(boolean readComments) { this.readComments = readComments; return this; } /** * Enables adjustments to comments to remove boilerplate text (related to threaded comments). * See https://github.com/pjfanning/excel-streaming-reader/issues/57. * * @param adjustLegacyComments whether to adjust legacy comments to remove boilerplate comments * @return reference to current {@code Builder} */ public Builder setAdjustLegacyComments(boolean adjustLegacyComments) { this.adjustLegacyComments = adjustLegacyComments; return this; } /** * Enables the reading of the core document properties. * * @param readCoreProperties whether to read the core document properties * @return reference to current {@code Builder} */ public Builder setReadCoreProperties(boolean readCoreProperties) { this.readCoreProperties = readCoreProperties; return this; } /** * Enables the reading of hyperlink data associated with sheets. * * @param readHyperlinks whether to read hyperlink data (associated with sheets) * @return reference to current {@code Builder} */ public Builder setReadHyperlinks(boolean readHyperlinks) { this.readHyperlinks = readHyperlinks; return this; } /** * Enables the reading of shape data. * * @param readShapes whether to read shapes (associated with pictures that appear in sheets) * @return reference to current {@code Builder} */ public Builder setReadShapes(boolean readShapes) { this.readShapes = readShapes; return this; } /** * Enables the reading of shared formulas. This is disabled by default. This feature is experimental. * Only affects cell formulas, cell values are retrieved using values stored in the sheet data. * * @param readSharedFormulas whether to read shared formulas * @return reference to current {@code Builder} */ public Builder setReadSharedFormulas(boolean readSharedFormulas) { this.readSharedFormulas = readSharedFormulas; return this; } /** * Enables/disables the reading of styles data. Enabled, by default. * It is recommended that you only disable this if you need to absolutely maximise performance. *

* The style data is very useful for formatting numbers in particular because the raw numbers in the * Excel file are in double precision format and may not match exactly what you see in the Excel cell. *

*

* With date and timestamp data, the raw data is also numeric and without the style data, the reader * will treat the data as numeric. If you already know if certain cells hold date or timestamp data, * the the getLocalDateTimeCellValue and getDateCellValue methods will work * even if you have disabled the reading of style data. *

* * @param readStyles Whether to read the styles (associated with cells) * @return reference to current {@code Builder} * @since v3.3.0 */ public Builder setReadStyles(boolean readStyles) { this.readStyles = readStyles; return this; } /** * Whether to parse the full formatting data for rich text shared strings and comments. * This only has an effect if temp file SST and/or Comments Table support is enabled. The default is false. * When you don't use temp file support, full formatting data is returned for the rich text anyway. * @param fullFormatRichText Whether to parse the full formatting data for rich text shared strings and comments. * @return reference to current {@code Builder} * @see #setUseSstTempFile(boolean) * @see #setUseCommentsTempFile(boolean) */ public Builder setFullFormatRichText(boolean fullFormatRichText) { this.fullFormatRichText = fullFormatRichText; return this; } /** * Reads a given {@code InputStream} and returns a new * instance of {@code Workbook}. Due to Apache POI * limitations, a temporary file must be written in order * to create a streaming iterator. This process will use * the same buffer size as specified in {@link #bufferSize(int)}. * * @param is input stream to read in * @return A {@link Workbook} that can be read from * @throws com.github.pjfanning.xlsx.exceptions.ReadException if there is an issue reading the stream */ public Workbook open(InputStream is) throws OpenException, ReadException, ParseException { StreamingWorkbookReader workbookReader = new StreamingWorkbookReader(this); workbookReader.init(is); return new StreamingWorkbook(workbookReader); } /** * Reads a given {@code File} and returns a new instance * of {@code Workbook}. * * @param file file to read in * @return built streaming reader instance * @throws com.github.pjfanning.xlsx.exceptions.OpenException if there is an issue opening the file * @throws com.github.pjfanning.xlsx.exceptions.ReadException if there is an issue reading the file */ public Workbook open(File file) throws OpenException, ReadException, ParseException { StreamingWorkbookReader workbookReader = new StreamingWorkbookReader(this); workbookReader.init(file); return new StreamingWorkbook(workbookReader); } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy