de.h2b.java.lib.office.ExcelReader Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of pa-toolbox-office Show documentation
The newest version!
/*
  PA-Toolbox -- Predictive Analytics Java Toolbox

  Copyright 2014-2016 Hans-Hermann Bode

  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
*/

package de.h2b.java.lib.office;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.WorkbookFactory;

/**
 * Implements methods to read Excel sheets and convert the results into suitable
 * data structures. 
 * 
 * The columns of the first row are interpreted as keys. This is reflected by
 * mapping a column index to its key where columns with empty keys are skipped.
 * 
 * The remaining rows are converted to a list of maps, where each item of the 
 * list represents a row of the sheet, mapping the keys defined above to the
 * values of the cells, respectively. Both keys and values of the map are 
 * strings. Note that entries of the map may be {@code null} (resulting 
 * from, e.g., empty cells), but there will be no entry for columns with empty
 * keys; also, rows containing empty cells only will be skipped.
 * 
 * @author h2b
 *
 */
public class ExcelReader {
	
	private List> content;
	
	private Map keys;
	
	/**
	 * Creates this object from a stream of excel data using the sheet with 
	 * index 0.
	 * 
	 * @param stream the input stream to read excel data from
	 * @throws OfficeFormatException
	 * @throws IOException
	 */
	public ExcelReader(InputStream stream) throws OfficeFormatException, IOException {
		this(stream, 0);
	}

	/**
	 * Creates this object from a stream of excel data using the sheet with the 
	 * given index.
	 * 
	 * @param stream the input stream to read excel data from
	 * @param sheetIdx the index of the sheet to be processed (counting from 0)
	 * @throws OfficeFormatException
	 * @throws IOException
	 */
	public ExcelReader(InputStream stream, int sheetIdx) throws OfficeFormatException, IOException {
		super();
		try {
			readRows(stream, sheetIdx);
		} catch (InvalidFormatException e) {
			//get rid of POI-specific extension
			throw new OfficeFormatException(e);
		}
	}
	
	/**
	 * Creates this object from an excel file using the sheet with index 0.
	 * 
	 * @param file the excel file to read from
	 * @throws OfficeFormatException
	 * @throws FileNotFoundException
	 * @throws IOException
	 */
	public ExcelReader(File file) throws OfficeFormatException, FileNotFoundException, IOException {
		this(file, 0);
	}

	/**
	 * Creates this object from an excel file using the sheet with the given 
	 * index.
	 * 
	 * @param file the excel file to read from
	 * @param sheetIdx the index of the sheet to be processed (counting from 0)
	 * @throws OfficeFormatException
	 * @throws FileNotFoundException
	 * @throws IOException
	 */
	public ExcelReader(File file, int sheetIdx) throws OfficeFormatException, FileNotFoundException, IOException {
		super();
		try {
			readRows(file, sheetIdx);
		} catch (InvalidFormatException e) {
			//get rid of POI-specific extension
			throw new OfficeFormatException(e);
		}
	}

	/**
	 * @param stream the input stream to read from
	 * @param idx the index of the sheet to be processed (counting from 0)
	 * @return the list of maps
	 * @throws InvalidFormatException
	 * @throws IOException
	 */
	private void readRows(InputStream stream, int idx)
			throws InvalidFormatException, IOException {
		Workbook wb = WorkbookFactory.create(stream);
		Sheet sheet = wb.getSheetAt(idx);
		Iterator iterator = sheet.rowIterator();
		if (iterator.hasNext()) {
			//first process row containing keys
			keys = readKeys(iterator.next());
			//then process remaining rows as ordinary content
			content = new ArrayList>();
			while (iterator.hasNext()) {
				Row row = iterator.next();
				Map rowContent = readContent(row);
				if (rowContent!=null) {
					//although the iterator already should skip empty rows,
					//apparently sometimes it doesn't, 
					//thus this additional check
					content.add(rowContent);
				}
			}
		}
	}

	/**
	 * Note: {@link #keys} must be set before.
	 * 
	 * @param row
	 * @return a map containing the values of the row specified, including 
	 *         {@code null} values, but restricted to the columns that have keys
	 */
	private Map readContent(Row row) {
		int size = keys.size();
		Map result = new HashMap(size);
		boolean empty = true;
		for (int j : keys.keySet()) {
			String val = getValueAsString(row, j);
			result.put(keys.get(j), val);
			if (val!=null&&!val.isEmpty()) {
				empty = false;
			}
		}
		return empty? null: result;
	}

	/**
	 * @param row
	 * @return a map containing the non-{@code null} values of the row specified
	 */
	private Map readKeys(Row row) {
		int size = row.getLastCellNum();
		Map result = new HashMap(size);
		for (int j = 0; j < size; j++) {
			String val = getValueAsString(row, j);
			if (val!=null) {
				result.put(j, val);
			}
		}
		return result;
	}

	/**
	 * @param f the file to read from
	 * @param idx the index of the sheet to be processed (counting from 0)
	 * @return the list of maps
	 * @throws IOException
	 * @throws InvalidFormatException 
	 */
	private void readRows(File f, int idx) 
			throws FileNotFoundException, IOException, InvalidFormatException {
		FileInputStream stream = null;
		try {
			stream = new FileInputStream(f);
			readRows(stream, idx);
		} finally {
			if (stream!=null) {
				stream.close();
			}
		}
	}

	/**
	 * @param row
	 * @param col
	 * @return string formatted value or {@code null} if cell is {@code null} or
	 *         of improper type (i.e., not meaningful convertible to string)
	 */
	private static String getValueAsString(Row row, int col) {
		Cell cell = row.getCell(col, Row.RETURN_BLANK_AS_NULL);
		return getValueAsString(cell);
	}

	/**
	 * @param cell
	 * @return string formatted value or {@code null} if cell is {@code null} or
	 *         of improper type (i.e., not meaningful convertible to string)
	 */
	private static String getValueAsString(Cell cell) {
		if (cell==null) {
			return null;
		}
		switch (cell.getCellType()) {
		case Cell.CELL_TYPE_STRING:
			return cell.getStringCellValue().trim();
		case Cell.CELL_TYPE_NUMERIC:
			return Double.toString(cell.getNumericCellValue()).trim();
		case Cell.CELL_TYPE_BOOLEAN:
			return Boolean.toString(cell.getBooleanCellValue()).trim();
		default:
			return null;
		}
	}

	/**
	 * @return the content
	 */
	public List> getContent() {
		return content;
	}

	/**
	 * @return the keys
	 */
	public Map getKeys() {
		return keys;
	}

}