All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.poi.hssf.extractor.ExcelExtractor Maven / Gradle / Ivy

There is a newer version: 5.2.5
Show newest version
/* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
==================================================================== */

package org.apache.poi.hssf.extractor;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.util.Locale;

import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFCellStyle;
import org.apache.poi.hssf.usermodel.HSSFComment;
import org.apache.poi.hssf.usermodel.HSSFDataFormatter;
import org.apache.poi.hssf.usermodel.HSSFRichTextString;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.ss.formula.eval.ErrorEval;
import org.apache.poi.ss.usermodel.HeaderFooter;
import org.apache.poi.ss.usermodel.Row.MissingCellPolicy;

/**
 * A text extractor for Excel files.
 * 

* Returns the textual content of the file, suitable for * indexing by something like Lucene, but not really * intended for display to the user. *

*

* To turn an excel file into a CSV or similar, then see * the XLS2CSVmra example *

* * @see XLS2CSVmra */ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.poi.ss.extractor.ExcelExtractor { private HSSFWorkbook _wb; private HSSFDataFormatter _formatter; private boolean _includeSheetNames = true; private boolean _shouldEvaluateFormulas = true; private boolean _includeCellComments = false; private boolean _includeBlankCells = false; private boolean _includeHeadersFooters = true; public ExcelExtractor(HSSFWorkbook wb) { super(wb); _wb = wb; _formatter = new HSSFDataFormatter(); } public ExcelExtractor(POIFSFileSystem fs) throws IOException { this(fs.getRoot()); } public ExcelExtractor(DirectoryNode dir) throws IOException { this(new HSSFWorkbook(dir, true)); } private static final class CommandParseException extends Exception { public CommandParseException(String msg) { super(msg); } } private static final class CommandArgs { private final boolean _requestHelp; private final File _inputFile; private final boolean _showSheetNames; private final boolean _evaluateFormulas; private final boolean _showCellComments; private final boolean _showBlankCells; private final boolean _headersFooters; public CommandArgs(String[] args) throws CommandParseException { int nArgs = args.length; File inputFile = null; boolean requestHelp = false; boolean showSheetNames = true; boolean evaluateFormulas = true; boolean showCellComments = false; boolean showBlankCells = false; boolean headersFooters = true; for (int i=0; i= nArgs) { throw new CommandParseException("Expected filename after '-i'"); } arg = args[i]; if (inputFile != null) { throw new CommandParseException("Only one input file can be supplied"); } inputFile = new File(arg); if (!inputFile.exists()) { throw new CommandParseException("Specified input file '" + arg + "' does not exist"); } if (inputFile.isDirectory()) { throw new CommandParseException("Specified input file '" + arg + "' is a directory"); } continue; } if ("--show-sheet-names".equals(arg)) { showSheetNames = parseBoolArg(args, ++i); continue; } if ("--evaluate-formulas".equals(arg)) { evaluateFormulas = parseBoolArg(args, ++i); continue; } if ("--show-comments".equals(arg)) { showCellComments = parseBoolArg(args, ++i); continue; } if ("--show-blanks".equals(arg)) { showBlankCells = parseBoolArg(args, ++i); continue; } if ("--headers-footers".equals(arg)) { headersFooters = parseBoolArg(args, ++i); continue; } throw new CommandParseException("Invalid argument '" + arg + "'"); } _requestHelp = requestHelp; _inputFile = inputFile; _showSheetNames = showSheetNames; _evaluateFormulas = evaluateFormulas; _showCellComments = showCellComments; _showBlankCells = showBlankCells; _headersFooters = headersFooters; } private static boolean parseBoolArg(String[] args, int i) throws CommandParseException { if (i >= args.length) { throw new CommandParseException("Expected value after '" + args[i-1] + "'"); } String value = args[i].toUpperCase(Locale.ROOT); if ("Y".equals(value) || "YES".equals(value) || "ON".equals(value) || "TRUE".equals(value)) { return true; } if ("N".equals(value) || "NO".equals(value) || "OFF".equals(value) || "FALSE".equals(value)) { return false; } throw new CommandParseException("Invalid value '" + args[i] + "' for '" + args[i-1] + "'. Expected 'Y' or 'N'"); } public boolean isRequestHelp() { return _requestHelp; } public File getInputFile() { return _inputFile; } public boolean shouldShowSheetNames() { return _showSheetNames; } public boolean shouldEvaluateFormulas() { return _evaluateFormulas; } public boolean shouldShowCellComments() { return _showCellComments; } public boolean shouldShowBlankCells() { return _showBlankCells; } public boolean shouldIncludeHeadersFooters() { return _headersFooters; } } private static void printUsageMessage(PrintStream ps) { ps.println("Use:"); ps.println(" " + ExcelExtractor.class.getName() + " [ [ [...]]] [-i ]"); ps.println(" -i specifies input file (default is to use stdin)"); ps.println(" Flags can be set on or off by using the values 'Y' or 'N'."); ps.println(" Following are available flags and their default values:"); ps.println(" --show-sheet-names Y"); ps.println(" --evaluate-formulas Y"); ps.println(" --show-comments N"); ps.println(" --show-blanks Y"); ps.println(" --headers-footers Y"); } /** * Command line extractor. * @throws IOException */ public static void main(String[] args) throws IOException { CommandArgs cmdArgs; try { cmdArgs = new CommandArgs(args); } catch (CommandParseException e) { System.err.println(e.getMessage()); printUsageMessage(System.err); System.exit(1); return; // suppress compiler error } if (cmdArgs.isRequestHelp()) { printUsageMessage(System.out); return; } InputStream is; if(cmdArgs.getInputFile() == null) { is = System.in; } else { is = new FileInputStream(cmdArgs.getInputFile()); } HSSFWorkbook wb = new HSSFWorkbook(is); is.close(); ExcelExtractor extractor = new ExcelExtractor(wb); extractor.setIncludeSheetNames(cmdArgs.shouldShowSheetNames()); extractor.setFormulasNotResults(!cmdArgs.shouldEvaluateFormulas()); extractor.setIncludeCellComments(cmdArgs.shouldShowCellComments()); extractor.setIncludeBlankCells(cmdArgs.shouldShowBlankCells()); extractor.setIncludeHeadersFooters(cmdArgs.shouldIncludeHeadersFooters()); System.out.println(extractor.getText()); extractor.close(); wb.close(); } /** * Should sheet names be included? Default is true */ public void setIncludeSheetNames(boolean includeSheetNames) { _includeSheetNames = includeSheetNames; } /** * Should we return the formula itself, and not * the result it produces? Default is false */ public void setFormulasNotResults(boolean formulasNotResults) { _shouldEvaluateFormulas = !formulasNotResults; } /** * Should cell comments be included? Default is false */ public void setIncludeCellComments(boolean includeCellComments) { _includeCellComments = includeCellComments; } /** * Should blank cells be output? Default is to only * output cells that are present in the file and are * non-blank. */ public void setIncludeBlankCells(boolean includeBlankCells) { _includeBlankCells = includeBlankCells; } /** * Should headers and footers be included in the output? * Default is to include them. */ public void setIncludeHeadersFooters(boolean includeHeadersFooters) { _includeHeadersFooters = includeHeadersFooters; } /** * Retrieves the text contents of the file */ public String getText() { StringBuffer text = new StringBuffer(); // We don't care about the difference between // null (missing) and blank cells _wb.setMissingCellPolicy(MissingCellPolicy.RETURN_BLANK_AS_NULL); // Process each sheet in turn for(int i=0;i<_wb.getNumberOfSheets();i++) { HSSFSheet sheet = _wb.getSheetAt(i); if(sheet == null) { continue; } if(_includeSheetNames) { String name = _wb.getSheetName(i); if(name != null) { text.append(name); text.append("\n"); } } // Header text, if there is any if(_includeHeadersFooters) { text.append(_extractHeaderFooter(sheet.getHeader())); } int firstRow = sheet.getFirstRowNum(); int lastRow = sheet.getLastRowNum(); for(int j=firstRow;j<=lastRow;j++) { HSSFRow row = sheet.getRow(j); if(row == null) { continue; } // Check each cell in turn int firstCell = row.getFirstCellNum(); int lastCell = row.getLastCellNum(); if(_includeBlankCells) { firstCell = 0; } for(int k=firstCell;k 0) { text.append(str.toString()); } break; case HSSFCell.CELL_TYPE_NUMERIC: HSSFCellStyle style = cell.getCellStyle(); double nVal = cell.getNumericCellValue(); short df = style.getDataFormat(); String dfs = style.getDataFormatString(); text.append(_formatter.formatRawCellContents(nVal, df, dfs)); break; case HSSFCell.CELL_TYPE_BOOLEAN: text.append(cell.getBooleanCellValue()); break; case HSSFCell.CELL_TYPE_ERROR: text.append(ErrorEval.getText(cell.getErrorCellValue())); break; default: throw new IllegalStateException("Unexpected cell cached formula result type: " + cell.getCachedFormulaResultType()); } } break; default: throw new RuntimeException("Unexpected cell type (" + cell.getCellType() + ")"); } // Output the comment, if requested and exists HSSFComment comment = cell.getCellComment(); if(_includeCellComments && comment != null) { // Replace any newlines with spaces, otherwise it // breaks the output String commentText = comment.getString().getString().replace('\n', ' '); text.append(" Comment by "+comment.getAuthor()+": "+commentText); } } // Output a tab if we're not on the last cell if(outputContents && k < (lastCell-1)) { text.append("\t"); } } // Finish off the row text.append("\n"); } // Finally Footer text, if there is any if(_includeHeadersFooters) { text.append(_extractHeaderFooter(sheet.getFooter())); } } return text.toString(); } public static String _extractHeaderFooter(HeaderFooter hf) { StringBuffer text = new StringBuffer(); if(hf.getLeft() != null) { text.append(hf.getLeft()); } if(hf.getCenter() != null) { if(text.length() > 0) text.append("\t"); text.append(hf.getCenter()); } if(hf.getRight() != null) { if(text.length() > 0) text.append("\t"); text.append(hf.getRight()); } if(text.length() > 0) text.append("\n"); return text.toString(); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy