All Downloads are FREE. Search and download functionalities are using the official Maven repository.

tech.tablesaw.io.xlsx.XlsxReader Maven / Gradle / Ivy

There is a newer version: 0.43.1
Show newest version
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package tech.tablesaw.io.xlsx;

import java.io.ByteArrayInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.List;

import javax.annotation.concurrent.Immutable;

import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellType;
import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Row.MissingCellPolicy;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

import tech.tablesaw.api.ColumnType;
import tech.tablesaw.api.DoubleColumn;
import tech.tablesaw.api.LongColumn;
import tech.tablesaw.api.Table;
import tech.tablesaw.columns.Column;
import tech.tablesaw.io.DataReader;
import tech.tablesaw.io.ReaderRegistry;
import tech.tablesaw.io.Source;

@Immutable
public class XlsxReader implements DataReader {

    private static final XlsxReader INSTANCE = new XlsxReader();

    static {
        register(Table.defaultReaderRegistry);
    }

    public static void register(ReaderRegistry registry) {
        registry.registerExtension("xlsx", INSTANCE);
        registry.registerMimeType("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", INSTANCE);
        registry.registerOptions(XlsxReadOptions.class, INSTANCE);
    }

    @Override
    public Table read(XlsxReadOptions options) throws IOException {
        List tables = readMultiple(options);
        if (tables.isEmpty()) {
            throw new IllegalArgumentException("No tables found.");            
        }
        return tables.get(0);
    }

    public List
readMultiple(XlsxReadOptions options) throws IOException { byte[] bytes = null; InputStream input = getInputStream(options, bytes); List
tables = new ArrayList
(); try (XSSFWorkbook workbook = new XSSFWorkbook(input)) { for (Sheet sheet : workbook) { TableRange tableArea = findTableArea(sheet); if (tableArea != null) { Table table = createTable(sheet, tableArea, options); tables.add(table); } } return tables; } finally { if (options.source().reader() == null) { // if we get a reader back from options it means the client opened it, so let // the client close it // if it's null, we close it here. input.close(); } } } private Boolean isBlank(Cell cell) { switch (cell.getCellType()) { case STRING: if (cell.getRichStringCellValue().length() > 0) { return false; } break; case NUMERIC: if (DateUtil.isCellDateFormatted(cell) ? cell.getDateCellValue() != null : cell.getNumericCellValue() != 0) { return false; } break; case BOOLEAN: if (cell.getBooleanCellValue()) { return false; } break; case BLANK: return true; default: break; } return null; } private ColumnType getColumnType(Cell cell) { switch (cell.getCellType()) { case STRING: return ColumnType.STRING; case NUMERIC: return DateUtil.isCellDateFormatted(cell) ? ColumnType.LOCAL_DATE_TIME : ColumnType.INTEGER; case BOOLEAN: return ColumnType.BOOLEAN; default: break; } return null; } private static class TableRange { private int startRow, endRow, startColumn, endColumn; TableRange(int startRow, int endRow, int startColumn, int endColumn) { this.startRow = startRow; this.endRow = endRow; this.startColumn = startColumn; this.endColumn = endColumn; } } private TableRange findTableArea(Sheet sheet) { // find first row and column with contents int row1 = -1; int row2 = -1; TableRange lastRowArea = null; for (Row row : sheet) { TableRange rowArea = findRowArea(row); if (lastRowArea == null && rowArea != null) { if (row1 < 0) { lastRowArea = rowArea; row1 = row.getRowNum(); row2 = row1; } } else if (lastRowArea != null && rowArea == null) { if (row2 > row1) { break; } else { row1 = -1; } } else if (lastRowArea == null && rowArea == null) { row1 = -1; } else if (rowArea.startColumn < lastRowArea.startColumn || rowArea.endColumn > lastRowArea.endColumn) { lastRowArea = null; row2 = -1; } else { row2 = row.getRowNum(); } } return row1 >= 0 && lastRowArea != null ? new TableRange(row1, row2, lastRowArea.startColumn, lastRowArea.endColumn) : null; } private TableRange findRowArea(Row row) { int col1 = -1; int col2 = -1; for (Cell cell : row) { Boolean blank = isBlank(cell); if (col1 < 0 && Boolean.FALSE.equals(blank)) { col1 = cell.getColumnIndex(); col2 = col1; } else if (col1 >= 0 && col2 >= col1) { if (Boolean.FALSE.equals(blank)) { col2 = cell.getColumnIndex(); } else if (Boolean.TRUE.equals(blank)) { break; } } } return col1 >= 0 && col2 >= col1 ? new TableRange(0, 0, col1, col2) : null; } private InputStream getInputStream(XlsxReadOptions options, byte[] bytes) throws FileNotFoundException { if (bytes != null) { return new ByteArrayInputStream(bytes); } if (options.source().inputStream() != null) { return options.source().inputStream(); } return new FileInputStream(options.source().file()); } private Table createTable(Sheet sheet, TableRange tableArea, XlsxReadOptions options) { // assume header row if all cells are of type String Row row = sheet.getRow(tableArea.startRow); List headerNames = new ArrayList<>(); for (Cell cell : row) { if (cell.getCellType() == CellType.STRING) { headerNames.add(cell.getRichStringCellValue().getString()); } else { break; } } if (headerNames.size() == tableArea.endColumn - tableArea.startColumn + 1) { tableArea.startRow++; } else { headerNames.clear(); for (int col = tableArea.startColumn; col <= tableArea.endColumn; col++) { headerNames.add("col" + col); } } Table table = Table.create(options.tableName()); List> columns = new ArrayList<>(Collections.nCopies(headerNames.size(), null)); for (int rowNum = tableArea.startRow; rowNum <= tableArea.endRow; rowNum++) { row = sheet.getRow(rowNum); for (int colNum = 0; colNum < headerNames.size(); colNum++) { Cell cell = row.getCell(colNum + tableArea.startColumn, MissingCellPolicy.RETURN_BLANK_AS_NULL); Column column = columns.get(colNum); if (cell != null) { if (column == null) { column = createColumn(headerNames.get(colNum), cell); columns.set(colNum, column); while (column.size() < rowNum - tableArea.startRow) { column.appendMissing(); } } Column altColumn = appendValue(column, cell); if (altColumn != null && altColumn != column) { column = altColumn; columns.set(colNum, column); } } if (column != null) { while (column.size() <= rowNum - tableArea.startRow) { column.appendMissing(); } } } } columns.removeAll(Collections.singleton(null)); table.addColumns(columns.toArray(new Column[columns.size()])); return table; } @SuppressWarnings("unchecked") private Column appendValue(Column column, Cell cell) { switch (cell.getCellType()) { case STRING: column.appendCell(cell.getRichStringCellValue().getString()); return null; case NUMERIC: if (DateUtil.isCellDateFormatted(cell)) { Date date = cell.getDateCellValue(); // This will return inconsistent results across time zones, but that matches Excel's behavior LocalDateTime localDate = date.toInstant().atZone(ZoneId.systemDefault()).toLocalDateTime(); column.appendCell(localDate.toString()); return null; } else { double num = cell.getNumericCellValue(); if (column.type() == ColumnType.INTEGER) { Column intColumn = (Column) column; if ((int) num == num) { intColumn.append((int) num); return null; } else if ((long) num == num) { Column altColumn = LongColumn.create(column.name(), column.size()); altColumn = intColumn.mapInto(s -> (long) s, altColumn); altColumn.append((long) num); return altColumn; } else { Column altColumn = DoubleColumn.create(column.name(), column.size()); altColumn = intColumn.mapInto(s -> (double) s, altColumn); altColumn.append(num); return altColumn; } } else if (column.type() == ColumnType.LONG) { Column longColumn = (Column) column; if ((long) num == num) { longColumn.append((long) num); return null; } else { Column altColumn = DoubleColumn.create(column.name(), column.size()); altColumn = longColumn.mapInto(s -> (double) s, altColumn); altColumn.append(num); return altColumn; } } else if (column.type() == ColumnType.DOUBLE) { Column doubleColumn = (Column) column; doubleColumn.append(num); return null; } } break; case BOOLEAN: if (column.type() == ColumnType.BOOLEAN) { Column booleanColumn = (Column) column; booleanColumn.append(cell.getBooleanCellValue()); return null; } default: break; } return null; } private Column createColumn(String name, Cell cell) { Column column; ColumnType columnType = getColumnType(cell); if (columnType == null) { columnType = ColumnType.STRING; } column = columnType.create(name); return column; } @Override public Table read(Source source) throws IOException { return read(XlsxReadOptions.builder(source).build()); } }