All Downloads are FREE. Search and download functionalities are using the official Maven repository.
Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it how often you want.
io.cdap.directives.parser.ParseExcel Maven / Gradle / Ivy
/*
* Copyright © 2017-2019 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package io.cdap.directives.parser;
import com.google.common.io.Closeables;
import io.cdap.cdap.api.annotation.Description;
import io.cdap.cdap.api.annotation.Name;
import io.cdap.cdap.api.annotation.Plugin;
import io.cdap.functions.Types;
import io.cdap.wrangler.api.Arguments;
import io.cdap.wrangler.api.Directive;
import io.cdap.wrangler.api.DirectiveExecutionException;
import io.cdap.wrangler.api.DirectiveParseException;
import io.cdap.wrangler.api.ErrorRowException;
import io.cdap.wrangler.api.ExecutorContext;
import io.cdap.wrangler.api.Optional;
import io.cdap.wrangler.api.Row;
import io.cdap.wrangler.api.annotations.Categories;
import io.cdap.wrangler.api.lineage.Lineage;
import io.cdap.wrangler.api.lineage.Many;
import io.cdap.wrangler.api.lineage.Mutation;
import io.cdap.wrangler.api.parser.ColumnName;
import io.cdap.wrangler.api.parser.Text;
import io.cdap.wrangler.api.parser.TokenType;
import io.cdap.wrangler.api.parser.UsageDefinition;
import org.apache.commons.lang3.StringUtils;
import org.apache.poi.hssf.usermodel.HSSFDateUtil;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellType;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.ByteArrayInputStream;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
/**
* A step to parse Excel files.
*/
@Plugin(type = Directive.TYPE)
@Name("parse-as-excel")
@Categories(categories = { "parser", "excel"})
@Description("Parses column as Excel file.")
public class ParseExcel implements Directive, Lineage {
public static final String NAME = "parse-as-excel";
private static final Logger LOG = LoggerFactory.getLogger(ParseExcel.class);
private String column;
private String sheet;
private boolean firstRowAsHeader = false;
@Override
public UsageDefinition define() {
UsageDefinition.Builder builder = UsageDefinition.builder(NAME);
builder.define("column", TokenType.COLUMN_NAME);
builder.define("sheet", TokenType.TEXT, Optional.TRUE);
builder.define("first-row-as-header", TokenType.BOOLEAN, Optional.TRUE);
return builder.build();
}
@Override
public void initialize(Arguments args) throws DirectiveParseException {
this.column = ((ColumnName) args.value("column")).value();
if (args.contains("sheet")) {
this.sheet = ((Text) args.value("sheet")).value();
} else {
this.sheet = "0";
}
if (args.contains("first-row-as-header")) {
this.firstRowAsHeader = ((Boolean) args.value("first-row-as-header").value());
}
}
@Override
public void destroy() {
// no-op
}
@Override
public List execute(List records, final ExecutorContext context)
throws DirectiveExecutionException, ErrorRowException {
List results = new ArrayList<>();
ByteArrayInputStream input = null;
try {
for (Row record : records) {
int idx = record.find(column);
if (idx != -1) {
Object object = record.getValue(idx);
byte[] bytes = null;
if (object instanceof byte[]) {
bytes = (byte[]) object;
} else if (object instanceof ByteBuffer) {
ByteBuffer buffer = (ByteBuffer) object;
bytes = new byte[buffer.remaining()];
buffer.get(bytes);
} else {
throw new DirectiveExecutionException(
NAME, String.format("Column '%s' should be of type 'byte array' or 'ByteBuffer'.", column));
}
if (bytes != null) {
input = new ByteArrayInputStream(bytes);
XSSFWorkbook book = new XSSFWorkbook(input);
XSSFSheet excelsheet;
if (Types.isInteger(sheet)) {
excelsheet = book.getSheetAt(Integer.parseInt(sheet));
} else {
excelsheet = book.getSheet(sheet);
}
if (excelsheet == null) {
throw new DirectiveExecutionException(
NAME, String.format("Failed to extract sheet '%s' from the excel. " +
"Sheet '%s' does not exist.", sheet, sheet));
}
Map columnNames = new TreeMap<>();
Iterator it = excelsheet.iterator();
int rows = 0;
while (it.hasNext()) {
org.apache.poi.ss.usermodel.Row row = it.next();
Iterator cellIterator = row.cellIterator();
if (checkIfRowIsEmpty(row)) {
continue;
}
Row newRow = new Row();
newRow.add("fwd", rows);
while (cellIterator.hasNext()) {
Cell cell = cellIterator.next();
String name = columnName(cell.getAddress().getColumn());
if (firstRowAsHeader && rows > 0) {
String value = columnNames.get(cell.getAddress().getColumn());
if (value != null) {
name = value;
}
}
String value = "";
switch (cell.getCellTypeEnum()) {
case STRING:
newRow.add(name, cell.getStringCellValue());
value = cell.getStringCellValue();
break;
case NUMERIC:
if (HSSFDateUtil.isCellDateFormatted(cell)) {
newRow.add(name, cell.getDateCellValue());
value = cell.getDateCellValue().toString();
} else {
newRow.add(name, cell.getNumericCellValue());
value = String.valueOf(cell.getNumericCellValue());
}
break;
case BOOLEAN:
newRow.add(name, cell.getBooleanCellValue());
value = String.valueOf(cell.getBooleanCellValue());
break;
}
if (rows == 0 && firstRowAsHeader) {
columnNames.put(cell.getAddress().getColumn(), value);
}
}
if (firstRowAsHeader && rows == 0) {
rows++;
continue;
}
results.add(newRow);
rows++;
}
if (firstRowAsHeader) {
rows = rows - 1;
}
for (int i = rows - 1; i >= 0; --i) {
results.get(rows - i - 1).addOrSetAtIndex(1, "bkd", i); // fwd - 0, bkd - 1.
}
}
}
}
} catch (Exception e) {
throw new ErrorRowException(NAME, e.getMessage(), 1);
} finally {
if (input != null) {
Closeables.closeQuietly(input);
}
}
return results;
}
@Override
public Mutation lineage() {
return Mutation.builder()
.readable("Parsed column '%s' as a Excel", column)
.all(Many.columns(column))
.build();
}
private boolean checkIfRowIsEmpty(org.apache.poi.ss.usermodel.Row row) {
if (row == null) {
return true;
}
if (row.getLastCellNum() <= 0) {
return true;
}
for (int cellNum = row.getFirstCellNum(); cellNum < row.getLastCellNum(); cellNum++) {
Cell cell = row.getCell(cellNum);
if (cell != null && cell.getCellTypeEnum() != CellType.BLANK && StringUtils.isNotBlank(cell.toString())) {
return false;
}
}
return true;
}
private String columnName(int number) {
final StringBuilder sb = new StringBuilder();
int num = number;
while (num >= 0) {
int numChar = (num % 26) + 65;
sb.append((char) numChar);
num = (num / 26) - 1;
}
return sb.reverse().toString();
}
}
|