All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.parser.dbf.DBFReader Maven / Gradle / Ivy

There is a newer version: 3.0.0-BETA2
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.dbf;

import org.apache.tika.exception.TikaException;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

/**
 * This reads many dbase3 file variants (not DBASE 7, yet!).
 * This parses the header on open.  The client
 * should get a row and then iterate until next() returns null.
 * Be careful to deepCopy the row (if caching) because the row
 * is mutable and will change as the reader iterates over new rows.
 * 

* This is based on: * http://ulisse.elettra.trieste.it/services/doc/dbase/DBFstruct.htm *

* This is designed to separate out Tika-specific code so that it can * be copied/pasted as a standalone if desired. */ class DBFReader { public static final int MAX_FIELD_LENGTH = 66000; public static boolean STRICT = false; enum Version { FOXBASE(0x02, "FoxBASE", ""), FOXBASE_PLUS(0x03, "FoxBASE_plus", ""), VISUAL_FOXPRO(0x30, "Visual_FoxPro", ""), VISUAL_FOXPRO_AUTOINCREMENT(0x31, "Visual_FoxPro", "autoincrement"), VISUAL_FOXPRO_VAR(0x32, "Visual_FoxPro", "Varchar_or_Varbinary"), DBASE_IV_SQL_TABLE(0x43, "dBASE_IV_SQL", "table"), DBASE_IV_SQL_SYSTEM(0x63, "dBASE_IV_SQL", "system"), FOX_BASE_PLUS_WITH_MEMO(0x83, "FoxBASE_plus", "memo"), DBASE_IV_WITH_MEMO(0x8B, "dBASE_IV", "memo"), DBASE_IV_SQL_TABLE_WITH_MEMO(0xCB, "dBASE_IV_SQL", "table_with_memo"), FOXPRO_2x_WITH_MEMO(0xF5, "FoxPro_2.x", "memo"), HIPER_SIZ_WITH_SMT_MEMO(0xE5, "HiPer-Siz", "SMT_memo"), FOXBASE2(0xFB, "FoxBASE", ""); private final int id; private final String format; private final String type; Version(int id, String format, String type) { this.id = id; this.format = format; this.type = type; } int getId() { return id; } String getFormat() { return format; } String getType() { return type; } String getFullMimeString() { StringBuilder sb = new StringBuilder(); sb.append("application/x-dbf; ").append("format=").append(getFormat()); if (!"".equals(type)) { sb.append("; type=").append(getType()); } return sb.toString(); } } ; private static final Map VERSION_MAP = new ConcurrentHashMap<>(); static { for (Version version : Version.values()) { VERSION_MAP.put(version.id, version); } } static DBFReader open(InputStream is) throws IOException, TikaException { return new DBFReader(is); } //can return null! static Version getVersion(int b) { return VERSION_MAP.get(b); } private final DBFFileHeader header; private final InputStream is; private DBFRow currRow = null; private Charset charset = StandardCharsets.US_ASCII; private DBFReader(InputStream is) throws IOException, TikaException { header = DBFFileHeader.parse(is); this.is = is; currRow = new DBFRow(header); } /** * Iterate through the rows with this. *

* Be careful: the reader reuses the row! Make sure to call deep copy * if you are buffering rows. * * @return * @throws IOException * @throws TikaException */ DBFRow next() throws IOException, TikaException { if (fillRow(currRow)) { return currRow; } return null; } //returns whether or not some content was read. //it might not be complete! private boolean fillRow(DBFRow row) throws IOException, TikaException { if (row == null) { return false; } DBFCell[] cells = row.cells; int isDeletedByte = is.read(); boolean isDeleted = false; if (isDeletedByte == 32) { //all ok } else if (isDeletedByte == 42) {//asterisk isDeleted = true; } else if (isDeletedByte == 26) {//marker for end of dbf file return false; } else if (isDeletedByte == -1) {//truncated file if (DBFReader.STRICT) { throw new IOException("EOF reached too early"); } return false; } else { throw new TikaException("Expecting space or asterisk at beginning of record, not:" + isDeletedByte); } row.setDeleted(isDeleted); boolean readSomeContent = false; for (int i = 0; i < cells.length; i++) { if (cells[i].read(is)) { readSomeContent = true; } } return readSomeContent; } public DBFFileHeader getHeader() { return header; } public Charset getCharset() { return charset; } /** * removes trailing 0 from byte array * * @param bytes * @return */ public static byte[] trim(byte[] bytes) { int end = bytes.length - 1; for (int i = end; i > -1; i--) { if (bytes[i] != 0) { end = i; break; } } if (end == bytes.length - 1) { return bytes; } byte[] ret = new byte[end + 1]; System.arraycopy(bytes, 0, ret, 0, end + 1); return ret; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy