All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.calcite.adapter.file.FileRowConverter Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to you under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.calcite.adapter.file;

import org.apache.calcite.adapter.java.JavaTypeFactory;
import org.apache.calcite.avatica.util.DateTimeUtils;
import org.apache.calcite.rel.type.RelDataType;
import org.apache.calcite.util.Pair;

import com.google.common.collect.ImmutableMap;
import com.joestelmach.natty.DateGroup;
import com.joestelmach.natty.Parser;

import org.checkerframework.checker.nullness.qual.Nullable;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.text.NumberFormat;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static org.apache.calcite.util.Util.first;

import static java.lang.Boolean.parseBoolean;
import static java.lang.Byte.parseByte;
import static java.util.Objects.requireNonNull;

/**
 * FileRowConverter.
 */
class FileRowConverter {

  // cache for lazy initialization
  private final FileReader fileReader;
  private final @Nullable List> fieldConfigs;
  private boolean initialized = false;

  // row parser configuration
  private final List fields = new ArrayList<>();

  /** Format for parsing numbers. Not thread-safe, but we assume that only
   * one thread uses this converter at a time. */
  private final NumberFormat numberFormat =
      NumberFormat.getInstance(Locale.ROOT);

  /** Format for parsing integers. Not thread-safe, but we assume that only
   * one thread uses this converter at a time. */
  private final NumberFormat integerFormat =
      NumberFormat.getIntegerInstance(Locale.ROOT);

  /** Creates a FileRowConverter. */
  FileRowConverter(FileReader fileReader,
      List> fieldConfigs) {
    this.fileReader = fileReader;
    this.fieldConfigs = fieldConfigs;
  }

  // initialize() - combine HTML table header information with field definitions
  //      to initialize the table reader
  // NB:  object initialization is deferred to avoid unnecessary URL reads
  private void initialize() {
    if (this.initialized) {
      return;
    }
    try {
      final Elements headerElements = this.fileReader.getHeadings();

      // create a name to index map for HTML table elements
      final Map headerMap = new LinkedHashMap<>();
      int i = 0;
      for (Element th : headerElements) {
        String heading = th.text();
        if (headerMap.containsKey(heading)) {
          throw new Exception("duplicate heading: '" + heading + "'");
        }
        headerMap.put(heading, i++);
      }

      // instantiate the field definitions
      final Set colNames = new HashSet<>();
      final Set sources = new HashSet<>();
      if (this.fieldConfigs != null) {
        try {
          for (Map fieldConfig : this.fieldConfigs) {

            String thName = (String) fieldConfig.get("th");
            String name = thName;
            String newName;
            FileFieldType type = null;
            boolean skip = false;

            if (!headerMap.containsKey(thName)) {
              throw new Exception("bad source column name: '" + thName + "'");
            }
            if ((newName = (String) fieldConfig.get("name")) != null) {
              name = newName;
            }
            if (colNames.contains(name)) {
              throw new Exception("duplicate column name: '" + name + "'");
            }

            String typeString = (String) fieldConfig.get("type");
            if (typeString != null) {
              type = FileFieldType.of(typeString);
            }

            String sSkip = (String) fieldConfig.get("skip");
            if (sSkip != null) {
              skip = parseBoolean(sSkip);
            }

            Integer sourceIx = headerMap.get(thName);
            colNames.add(name);
            sources.add(thName);
            if (!skip) {
              addFieldDef(name, type, fieldConfig, sourceIx);
            }
          }
        } catch (RuntimeException e) {
          throw e;
        } catch (Exception e) {
          throw new RuntimeException(e);
        }
      }

      // pick up any data elements not explicitly defined
      for (Map.Entry e : headerMap.entrySet()) {
        final String name = e.getKey();
        if (!sources.contains(name) && !colNames.contains(name)) {
          addFieldDef(name, null, ImmutableMap.of(), e.getValue());
        }
      }

    } catch (RuntimeException e) {
      throw e;
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
    this.initialized = true;
  }

  // add another field definition to the FileRowConverter during initialization
  private void addFieldDef(String name, @Nullable FileFieldType type,
      Map config, int sourceCol) {
    this.fields.add(new FieldDef(name, type, config, sourceCol));
  }

  /** Converts a row of JSoup Elements to an array of java objects. */
  Object toRow(Elements rowElements, int[] projection) {
    initialize();
    final Object[] objects = new Object[projection.length];

    for (int i = 0; i < projection.length; i++) {
      int field = projection[i];
      objects[i] = this.fields.get(field).convert(rowElements);
    }
    return objects;
  }

  int width() {
    initialize();
    return this.fields.size();
  }

  RelDataType getRowType(JavaTypeFactory typeFactory) {
    initialize();
    List names = new ArrayList<>();
    List types = new ArrayList<>();

    // iterate through FieldDefs, populating names and types
    for (FieldDef f : this.fields) {
      names.add(f.getName());

      @Nullable FileFieldType fieldType = f.getType();
      RelDataType type;

      if (fieldType == null) {
        type = typeFactory.createJavaType(String.class);
      } else {
        type = fieldType.toType(typeFactory);
      }

      types.add(type);
    }

    if (names.isEmpty()) {
      names.add("line");
      types.add(typeFactory.createJavaType(String.class));
    }

    return typeFactory.createStructType(Pair.zip(names, types));
  }

  /** Parses an HTML table cell. */
  private static class CellReader {
    private final String selector;
    private final @Nullable Integer selectedElement;
    private final @Nullable Pattern replacePattern;
    private final String replaceWith;
    private final @Nullable Pattern matchPattern;
    private final int matchSeq;

    CellReader(Map config) {
      final @Nullable String unusedType = (String) config.get("type");
      this.selector = first((String) config.get("selector"), "*");
      this.selectedElement = (Integer) config.get("selectedElement");
      @Nullable String replace = (String) config.get("replace");
      this.replacePattern = replace == null ? null : Pattern.compile(replace);
      this.replaceWith = first((String) config.get("replaceWith"), "");
      @Nullable String match = (String) config.get("match");
      this.matchPattern = match == null ? null : Pattern.compile(match);
      this.matchSeq = first((Integer) config.get("matchSeq"), 0);
    }

    @Nullable String read(Element cell) {
      ArrayList cellText = new ArrayList<>();

      if (this.selectedElement != null) {
        cellText.add(cell.select(this.selector)
            .get(this.selectedElement).ownText());
      } else {
        for (Element child : cell.select(this.selector)) {
          // String tagName = child.tag().getName();
          cellText.add(child.ownText());
        }
      }

      String cellString = String.join(" ", cellText).trim();

      // replace
      if (this.replacePattern != null) {
        Matcher m = this.replacePattern.matcher(cellString);
        cellString = m.replaceAll(this.replaceWith);
      }

      // match
      if (this.matchPattern == null) {
        return cellString;
      } else {
        List allMatches = new ArrayList<>();
        Matcher m = this.matchPattern.matcher(cellString);
        while (m.find()) {
          allMatches.add(m.group());
        }
        if (!allMatches.isEmpty()) {
          return allMatches.get(this.matchSeq);
        } else {
          return null;
        }
      }
    }
  }

  /** Responsible for managing field (column) definition,
   * and for converting an Element to a java data type. */
  private class FieldDef {
    final String name;
    final @Nullable FileFieldType type;
    final Map config;
    final CellReader cellReader;
    final int cellSeq;

    FieldDef(String name, @Nullable FileFieldType type,
        Map config, int cellSeq) {
      this.name = requireNonNull(name, "name");
      this.type = type;
      this.config = requireNonNull(config, "config");
      this.cellReader = new CellReader(config);
      this.cellSeq = cellSeq;
    }

    @Nullable Object convert(Elements row) {
      return toObject(this.type, this.cellReader.read(row.get(this.cellSeq)));
    }

    public String getName() {
      return this.name;
    }

    @Nullable FileFieldType getType() {
      return this.type;
    }

    private java.util.Date parseDate(String string) {
      Parser parser = new Parser(DateTimeUtils.UTC_ZONE);
      List groups = parser.parse(string);
      DateGroup group = groups.get(0);
      return group.getDates().get(0);
    }

    @SuppressWarnings("JavaUtilDate")
    private @Nullable Object toObject(@Nullable FileFieldType fieldType,
        @Nullable String string) {
      if (string == null || string.isEmpty()) {
        return null;
      }

      if (fieldType == null) {
        return string;
      }

      switch (fieldType) {
      case BOOLEAN:
        return parseBoolean(string);

      case BYTE:
        return parseByte(string);

      case SHORT:
        try {
          return integerFormat.parse(string).shortValue();
        } catch (ParseException e) {
          return null;
        }

      case INT:
        try {
          return integerFormat.parse(string).intValue();
        } catch (ParseException e) {
          return null;
        }

      case LONG:
        try {
          return numberFormat.parse(string).longValue();
        } catch (ParseException e) {
          return null;
        }

      case FLOAT:
        try {
          return numberFormat.parse(string).floatValue();
        } catch (ParseException e) {
          return null;
        }

      case DOUBLE:
        try {
          return numberFormat.parse(string).doubleValue();
        } catch (ParseException e) {
          return null;
        }

      case DATE:
        return new java.sql.Date(parseDate(string).getTime());

      case TIME:
        return new java.sql.Time(parseDate(string).getTime());

      case TIMESTAMP:
        return new java.sql.Timestamp(parseDate(string).getTime());

      case STRING:
      default:
        return string;
      }
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy