/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.dbf;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.tika.exception.TikaException;
/**
* This reads many dbase3 file variants (not DBASE 7, yet!).
* The header is parsed on open. The client should then call
* next() repeatedly until it returns null.
* Be careful to deepCopy the row (if caching) because the row
* is mutable and will change as the reader iterates over new rows.
* <p>
* This is based on:
* http://ulisse.elettra.trieste.it/services/doc/dbase/DBFstruct.htm
* <p>
* This is designed to separate out Tika-specific code so that it can
* be copied/pasted as a standalone reader if desired.
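* <p>
* A minimal usage sketch (same-package caller; assuming DBFRow exposes the deepCopy referenced above):
* <pre>{@code
* try (InputStream is = Files.newInputStream(path)) {
*     DBFReader reader = DBFReader.open(is);
*     for (DBFRow row = reader.next(); row != null; row = reader.next()) {
*         DBFRow copy = row.deepCopy(); //the reader reuses this row instance
*         //... process copy ...
*     }
* }
* }</pre>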
*/
class DBFReader {
public static final int MAX_FIELD_LENGTH = 66000;
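//when true, a truncated file triggers an IOException in fillRow() instead of quietly ending iteration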
public static boolean STRICT = false;
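//known dbf variants, keyed by the version byte (the first byte of the file header)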
enum Version {
FOXBASE(0x02, "FoxBASE", ""),
FOXBASE_PLUS(0x03, "FoxBASE_plus", ""),
VISUAL_FOXPRO(0x30, "Visual_FoxPro", ""),
VISUAL_FOXPRO_AUTOINCREMENT(0x31, "Visual_FoxPro", "autoincrement"),
VISUAL_FOXPRO_VAR(0x32, "Visual_FoxPro", "Varchar_or_Varbinary"),
DBASE_IV_SQL_TABLE(0x43, "dBASE_IV_SQL", "table"),
DBASE_IV_SQL_SYSTEM(0x63, "dBASE_IV_SQL", "system"),
FOX_BASE_PLUS_WITH_MEMO(0x83, "FoxBASE_plus", "memo"),
DBASE_IV_WITH_MEMO(0x8B, "dBASE_IV", "memo"),
DBASE_IV_SQL_TABLE_WITH_MEMO(0xCB, "dBASE_IV_SQL", "table_with_memo"),
FOXPRO_2x_WITH_MEMO(0xF5, "FoxPro_2.x", "memo"),
HIPER_SIZ_WITH_SMT_MEMO(0xE5, "HiPer-Siz", "SMT_memo"),
FOXBASE2(0xFB, "FoxBASE", "");
private final int id;
private final String format;
private final String type;
Version(int id, String format, String type) {
this.id = id;
this.format = format;
this.type = type;
}
int getId() {
return id;
}
String getFormat() {
return format;
}
String getType() {
return type;
}
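//e.g. DBASE_IV_WITH_MEMO.getFullMimeString() returns "application/x-dbf; format=dBASE_IV; type=memo"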
String getFullMimeString() {
StringBuilder sb = new StringBuilder();
sb.append("application/x-dbf; ").append("format=").append(getFormat());
if (!"".equals(type)) {
sb.append("; type=").append(getType());
}
return sb.toString();
}
}
private static final Map<Integer, Version> VERSION_MAP = new ConcurrentHashMap<>();
static {
for (Version version : Version.values()) {
VERSION_MAP.put(version.id, version);
}
}
static DBFReader open(InputStream is) throws IOException, TikaException {
return new DBFReader(is);
}
//can return null!
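//e.g. getVersion(0x30) returns Version.VISUAL_FOXPRO; unrecognized version bytes return null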
static Version getVersion(int b) {
return VERSION_MAP.get(b);
}
private final DBFFileHeader header;
private final InputStream is;
private DBFRow currRow = null;
private Charset charset = StandardCharsets.US_ASCII;
private DBFReader(InputStream is) throws IOException, TikaException {
header = DBFFileHeader.parse(is);
this.is = is;
currRow = new DBFRow(header);
}
/**
* Iterate through the rows with this.
*
* Be careful: the reader reuses the row! Make sure to call deep copy
* if you are buffering rows.
*
* @return the next row, or null if there are no more rows
* @throws IOException if the underlying stream cannot be read
* @throws TikaException if a record is malformed
*/
DBFRow next() throws IOException, TikaException {
if (fillRow(currRow)) {
return currRow;
}
return null;
}
//returns whether or not some content was read.
//it might not be complete!
private boolean fillRow(DBFRow row) throws IOException, TikaException {
if (row == null) {
return false;
}
DBFCell[] cells = row.cells;
int isDeletedByte = is.read();
boolean isDeleted = false;
if (isDeletedByte == 32) {
//space: record is not deleted, all ok
} else if (isDeletedByte == 42) {//asterisk
isDeleted = true;
} else if (isDeletedByte == 26) {//marker for end of dbf file
return false;
} else if (isDeletedByte == -1) {//truncated file
if (DBFReader.STRICT) {
throw new IOException("EOF reached too early");
}
return false;
} else {
throw new TikaException("Expecting space or asterisk at beginning of record, not:" + isDeletedByte);
}
row.setDeleted(isDeleted);
boolean readSomeContent = false;
for (int i = 0; i < cells.length; i++) {
if (cells[i].read(is)) {
readSomeContent = true;
}
}
return readSomeContent;
}
public DBFFileHeader getHeader() {
return header;
}
public Charset getCharset() {
return charset;
}
/**
* Removes trailing 0x00 bytes from a byte array. If the last byte is non-zero,
* or if every byte is zero, the original array is returned unchanged.
*
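* <p>Example: {@code trim(new byte[]{65, 66, 0, 0})} returns {@code {65, 66}}.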
* @param bytes byte array to trim
* @return the array without trailing zero bytes, or the original array if there is nothing to trim
*/
public static byte[] trim(byte[] bytes) {
int end = bytes.length - 1;
for (int i = end; i > -1; i--) {
if (bytes[i] != 0) {
end = i;
break;
}
}
if (end == bytes.length - 1) {
return bytes;
}
byte[] ret = new byte[end + 1];
System.arraycopy(bytes, 0, ret, 0, end + 1);
return ret;
}
}