io.questdb.cutlass.text.TextMetadataDetector Maven / Gradle / Ivy
* ___ _ ____ ____
* / _ \ _ _ ___ ___| |_| _ \| __ )
* | | | | | | |/ _ \/ __| __| | | | _ \
* | |_| | |_| | __/\__ \ |_| |_| | |_) |
* \__\_\\__,_|\___||___/\__|____/|____/
* Copyright (c) 2014-2019 Appsicle
* Copyright (c) 2019-2020 QuestDB
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
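* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.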
* See the License for the specific language governing permissions and
* limitations under the License.
package io.questdb.cutlass.text;
import io.questdb.cairo.ColumnType;
import io.questdb.cutlass.text.types.TypeAdapter;
import io.questdb.cutlass.text.types.TypeManager;
import io.questdb.log.Log;
import io.questdb.log.LogFactory;
import io.questdb.std.*;
import io.questdb.std.str.DirectByteCharSequence;
import io.questdb.std.str.DirectCharSink;
import io.questdb.std.str.StringSink;
/**
 * Lexer listener that gathers per-column statistics from parsed text rows and uses them
 * to pick a type adapter for every column and to decide whether the first row is a header.
 * Column names and types can be overridden with a user-supplied schema (see {@code of}).
 */
public class TextMetadataDetector implements TextLexer.Listener, Mutable, Closeable {
private static final Log LOG = LogFactory.getLog(TextMetadataDetector.class);
// scratch buffer; its contents are used as a column name by evaluateResults() and normalise()
private final StringSink tempSink = new StringSink();
// detected (or schema-overridden) type adapter per column, indexed by column position
private final ObjList columnTypes = new ObjList<>();
// column names per column position; an empty string means "no name captured"
private final ObjList columnNames = new ObjList<>();
// per-column count that calcTypes() reads as the number of blank values
private final IntList _blanks = new IntList();
// flattened [column][probe] hit counters; the slot for probe k of column i is i * probeCount + k
private final IntList _histogram = new IntList();
// user-supplied schema: column name -> type adapter, filled by of()
private final CharSequenceObjHashMap schemaColumns = new CharSequenceObjHashMap<>();
// source of probes and type adapters
private final TypeManager typeManager;
// destination of UTF-8 decoding of first-row values in stashPossibleHeader()
private final DirectCharSink utf8Sink;
// number of columns being tracked; assigned in seedFields()
private int fieldCount;
// outcome of header detection, set by evaluateResults()
private boolean header = false;
// when true evaluateResults() reports a header regardless of the type analysis
private boolean forceHeader = false;
// used only in log messages
private CharSequence tableName;
/**
 * Creates a detector.
 *
 * @param typeManager       provides the probes and type adapters used for type detection
 * @param textConfiguration provides the size passed to the UTF-8 decoding sink
 */
public TextMetadataDetector(
TypeManager typeManager,
TextConfiguration textConfiguration
) {
this.typeManager = typeManager;
// the sink is sized once, from configuration, and reused for every header value
this.utf8Sink = new DirectCharSink(textConfiguration.getUtf8SinkSize());
/**
 * Resets the scalar detection state: the field count and both header flags.
 */
public void clear() {
// NOTE(review): none of the collections (column names/types, histogram, blanks, schema columns,
// temp sink) is reset in the lines visible here - confirm whether statements are missing
fieldCount = 0;
header = false;
forceHeader = false;
/**
 * Close hook of the detector.
 */
// NOTE(review): no resource release is visible here although utf8Sink is allocated in the
// constructor - confirm whether it has to be freed in this method
public void close() {;
/**
 * Finalises detection after all rows have been offered: resolves column types, decides
 * whether the first row was a header, fills in missing column names and finally applies
 * the user-supplied schema on top of the detected types.
 *
 * @param lineCount  total number of lines seen
 * @param errorCount number of lines excluded from the analysis; it is subtracted from lineCount
 */
public void evaluateResults(long lineCount, long errorCount) {
// try to calculate types counting all rows
// if all types come up as strings, reduce lineCount by one and retry
// if some fields come up as non-string after subtracting a row - we have a header
// (the second calcTypes() call only runs when the first one reported "all strings")
if ((calcTypes(lineCount - errorCount, true) && !calcTypes(lineCount - errorCount - 1, false)) || forceHeader) {
// copy headers
header = true;
} else {
// NOTE(review): the receiver of this logging chain and its terminating call are not visible
.$("no header [table=").$(tableName)
.$(", lineCount=").$(lineCount)
.$(", errorCount=").$(errorCount)
.$(", forceHeader=").$(forceHeader)
// make up field names if there is no header
for (int i = 0; i < fieldCount; i++) {
// a name is generated when there is no header at all or the header cell was empty
if (!header || columnNames.getQuick(i).length() == 0) {
// NOTE(review): the statements that populate tempSink with the generated name are not visible
columnNames.setQuick(i, tempSink.toString());
// override calculated types with user-supplied information
if (schemaColumns.size() > 0) {
for (int i = 0, k = columnNames.size(); i < k; i++) {
// lookup is by final column name; columns absent from the schema keep the detected type
TypeAdapter type = schemaColumns.get(columnNames.getQuick(i));
if (type != null) {
columnTypes.setQuick(i, type);
/**
 * @return the header flag; evaluateResults() sets it to true when it concludes that the
 * first row is a header or when a header was forced
 */
public boolean isHeader() {
return header;
/**
 * Configures the detector with an optional user-supplied schema and the force-header flag.
 *
 * @param names       schema column names; the schema is registered only when both lists are non-null
 * @param types       schema column types, positionally matching {@code names}
 * @param forceHeader value stored in the forceHeader field, which evaluateResults() consults
 */
public void of(ObjList names, ObjList types, boolean forceHeader) {
if (names != null && types != null) {
final int n = names.size();
// both lists are expected to be parallel; only checked when assertions are enabled
assert n == types.size();
for (int i = 0; i < n; i++) {
schemaColumns.put(names.getQuick(i), types.getQuick(i));
this.forceHeader = forceHeader;
/**
 * Lexer callback invoked with the values of one parsed line. Runs every probe against
 * every field and counts the probes that accept the value.
 *
 * @param line       line number; line 0 is additionally stashed as a possible header
 * @param values     field values of the line
 * @param fieldCount number of fields in {@code values} to process
 */
public void onFields(long line, ObjList values, int fieldCount) {
// keep the first line in case it's a header
if (line == 0) {
stashPossibleHeader(values, fieldCount);
int count = typeManager.getProbeCount();
for (int i = 0; i < fieldCount; i++) {
DirectByteCharSequence cs = values.getQuick(i);
// NOTE(review): the body of this blank-value branch is not visible - confirm what it records
if (cs.length() == 0) {
// start of column i's slice in the flattened histogram
int offset = i * count;
for (int k = 0; k < count; k++) {
final TypeAdapter probe = typeManager.getProbe(k);
if (probe.probe(cs)) {
_histogram.increment(k + offset);
/**
* Histogram contains counts for every probe that validates a field. It is possible for multiple probes to validate the same field.
* This can happen for two reasons:
* 1. Probes are compatible; for example, INT is compatible with DOUBLE in the sense that the DOUBLE probe will positively
* validate every INT. If this is the case we use the order of probes as priority: the first probe wins.
* 2. It is possible to have mixed types in the same column, in which case the column has to become a string.
* To establish whether we have a mixed column we check if the probe count + blank values add up to the total number of rows.
*/
// Resolves a type adapter for every column from the probe histogram.
// count      - number of rows that a probe's hits plus the column's blanks must add up to
// setDefault - when true, a column that no probe fully explains is assigned the STRING adapter
// returns true when no column was matched by any probe, i.e. every column is a string
private boolean calcTypes(long count, boolean setDefault) {
boolean allStrings = true;
int probeCount = typeManager.getProbeCount();
for (int i = 0; i < fieldCount; i++) {
// start of column i's slice in the flattened histogram
int offset = i * probeCount;
int blanks = _blanks.getQuick(i);
boolean unprobed = true;
for (int k = 0; k < probeCount; k++) {
// the probe must accept every non-blank row, and the column must not be blank throughout
if (_histogram.getQuick(k + offset) + blanks == count && blanks < count) {
unprobed = false;
columnTypes.setQuick(i, typeManager.getProbe(k));
if (allStrings) {
allStrings = false;
// NOTE(review): no early exit from the probe loop is visible here, so a later matching
// probe would overwrite an earlier one - confirm against the intended probe priority
if (setDefault && unprobed) {
columnTypes.setQuick(i, typeManager.getTypeAdapter(ColumnType.STRING));
return allStrings;
// Returns the internal list of column names (the live list, not a copy).
ObjList getColumnNames() {
return columnNames;
// Returns the internal list of column type adapters (the live list, not a copy).
ObjList getColumnTypes() {
return columnTypes;
// metadata detector is essentially part of text lexer
// we can potentially keep a cache of char sequences until the whole
// system is reset, similar to flyweight char sequence over array of chars
// Turns a raw header value into a column name. In the visible lines, the listed separator
// characters set a "capitalise next" flag and the result is a String copy of tempSink.
// NOTE(review): the statements that reset and append to tempSink, the break statements and the
// default branch of the switch are not visible here - confirm against the full file.
private String normalise(CharSequence seq) {
boolean capNext = false;
for (int i = 0, l = seq.length(); i < l; i++) {
char c = seq.charAt(i);
switch (c) {
// separators and punctuation: they mark the following character for capitalisation
case ' ':
case '_':
case '?':
case '.':
case ',':
case '\'':
case '\"':
case '\\':
case '/':
case '\0':
case ':':
case ')':
case '(':
case '+':
case '-':
case '*':
case '%':
case '~':
capNext = true;
// NOTE(review): no break is visible before the next label - confirm fall-through is not intended
case 0xfeff: // UTF-8 BOM (Byte Order Mark) can appear at the beginning of a character stream
// true only for a digit that would become the first character of the name
if (tempSink.length() == 0 && Character.isDigit(c)) {
if (capNext) {
capNext = false;
} else {
return Chars.toString(tempSink);
// Prepares the per-column state for `count` columns and records that count in fieldCount.
private void seedFields(int count) {
// fieldCount is assigned inside the expression; the histogram gets one slot per (column, probe) pair
this._histogram.setAll((fieldCount = count) * typeManager.getProbeCount(), 0);
this._blanks.setAll(count, 0);
// presumably grows the list so that index count - 1 is addressable - confirm extendAndSet semantics
this.columnTypes.extendAndSet(count - 1, null);
// an empty name is what evaluateResults() treats as "no name captured"
this.columnNames.setAll(count, "");
// Stores the table name; in the visible code it is only used in log messages.
void setTableName(CharSequence tableName) {
this.tableName = tableName;
// Decodes the first `hi` values of a line from UTF-8 and stores their normalised form as
// candidate column names; a value that fails to decode is reported via a "utf8 error" log message.
private void stashPossibleHeader(ObjList values, int hi) {
for (int i = 0; i < hi; i++) {
DirectByteCharSequence value = values.getQuick(i);
// decode the [lo, hi) byte range of the value into the shared utf8Sink
if (Chars.utf8Decode(value.getLo(), value.getHi(), utf8Sink)) {
columnNames.setQuick(i, normalise(utf8Sink));
// NOTE(review): the receiver of the logging chain on the next line is not visible
} else {$("utf8 error [table=").$(tableName).$(", line=0, col=").$(i).$(']').$();