/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.dbf;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.tika.exception.TikaException;
/**
* This reads many dbase3 file variants (not DBASE 7, yet!).
* The header is parsed on open. The client should then call
* next() repeatedly until it returns null.
* Be careful to deepCopy the row (if caching) because the row
* is mutable and will change as the reader iterates over new rows.
* <p>
* This is based on:
* http://ulisse.elettra.trieste.it/services/doc/dbase/DBFstruct.htm
* <p>
* This is designed to separate out Tika-specific code so that it can
* be copied/pasted as a standalone reader if desired.
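* <p>
* A minimal usage sketch (same-package caller; assuming DBFRow exposes the deepCopy referenced above):
* <pre>{@code
* try (InputStream is = Files.newInputStream(path)) {
*     DBFReader reader = DBFReader.open(is);
*     for (DBFRow row = reader.next(); row != null; row = reader.next()) {
*         DBFRow copy = row.deepCopy(); //the reader reuses this row instance
*         //... process copy ...
*     }
* }
* }</pre>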
*/
class DBFReader {
public static final int MAX_FIELD_LENGTH = 66000;
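//when true, a truncated file triggers an IOException in fillRow() instead of quietly ending iteration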
public static boolean STRICT = false;
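//known dbf variants, keyed by the version byte (the first byte of the file header)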
enum Version {
FOXBASE(0x02, "FoxBASE", ""),
FOXBASE_PLUS(0x03, "FoxBASE_plus", ""),
VISUAL_FOXPRO(0x30, "Visual_FoxPro", ""),
VISUAL_FOXPRO_AUTOINCREMENT(0x31, "Visual_FoxPro", "autoincrement"),
VISUAL_FOXPRO_VAR(0x32, "Visual_FoxPro", "Varchar_or_Varbinary"),
DBASE_IV_SQL_TABLE(0x43, "dBASE_IV_SQL", "table"),
DBASE_IV_SQL_SYSTEM(0x63, "dBASE_IV_SQL", "system"),
FOX_BASE_PLUS_WITH_MEMO(0x83, "FoxBASE_plus", "memo"),
DBASE_IV_WITH_MEMO(0x8B, "dBASE_IV", "memo"),
DBASE_IV_SQL_TABLE_WITH_MEMO(0xCB, "dBASE_IV_SQL", "table_with_memo"),
FOXPRO_2x_WITH_MEMO(0xF5, "FoxPro_2.x", "memo"),
HIPER_SIZ_WITH_SMT_MEMO(0xE5, "HiPer-Siz", "SMT_memo"),
FOXBASE2(0xFB, "FoxBASE", "");
private final int id;
private final String format;
private final String type;
Version(int id, String format, String type) {
this.id = id;
this.format = format;
this.type = type;
}
int getId() {
return id;
}
String getFormat() {
return format;
}
String getType() {
return type;
}
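//e.g. DBASE_IV_WITH_MEMO.getFullMimeString() returns "application/x-dbf; format=dBASE_IV; type=memo"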
String getFullMimeString() {
StringBuilder sb = new StringBuilder();
sb.append("application/x-dbf; ").append("format=").append(getFormat());
if (!"".equals(type)) {
sb.append("; type=").append(getType());
}
return sb.toString();
}
}
private static final Map<Integer, Version> VERSION_MAP = new ConcurrentHashMap<>();
static {
for (Version version : Version.values()) {
VERSION_MAP.put(version.id, version);
}
}
static DBFReader open(InputStream is) throws IOException, TikaException {
return new DBFReader(is);
}
//can return null!
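//e.g. getVersion(0x30) returns Version.VISUAL_FOXPRO; unrecognized version bytes return null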
static Version getVersion(int b) {
return VERSION_MAP.get(b);
}
private final DBFFileHeader header;
private final InputStream is;
private DBFRow currRow = null;
private Charset charset = StandardCharsets.US_ASCII;
private DBFReader(InputStream is) throws IOException, TikaException {
header = DBFFileHeader.parse(is);
this.is = is;
currRow = new DBFRow(header);
}
/**
* Iterate through the rows with this.
*
* Be careful: the reader reuses the row! Make sure to call deep copy
* if you are buffering rows.
*
* @return the next row, or null if there are no more rows
* @throws IOException if the underlying stream cannot be read
* @throws TikaException if a record is malformed
*/
DBFRow next() throws IOException, TikaException {
if (fillRow(currRow)) {
return currRow;
}
return null;
}
//returns whether or not some content was read.
//it might not be complete!
private boolean fillRow(DBFRow row) throws IOException, TikaException {
if (row == null) {
return false;
}
DBFCell[] cells = row.cells;
int isDeletedByte = is.read();
boolean isDeleted = false;
if (isDeletedByte == 32) {
//space: record is not deleted, all ok
} else if (isDeletedByte == 42) {//asterisk
isDeleted = true;
} else if (isDeletedByte == 26) {//marker for end of dbf file
return false;
} else if (isDeletedByte == -1) {//truncated file
if (DBFReader.STRICT) {
throw new IOException("EOF reached too early");
}
return false;
} else {
throw new TikaException("Expecting space or asterisk at beginning of record, not:" + isDeletedByte);
}
row.setDeleted(isDeleted);
boolean readSomeContent = false;
for (int i = 0; i < cells.length; i++) {
if (cells[i].read(is)) {
readSomeContent = true;
}
}
return readSomeContent;
}
public DBFFileHeader getHeader() {
return header;
}
public Charset getCharset() {
return charset;
}
/**
* Removes trailing 0x00 bytes from a byte array. If the last byte is non-zero,
* or if every byte is zero, the original array is returned unchanged.
*
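* <p>Example: {@code trim(new byte[]{65, 66, 0, 0})} returns {@code {65, 66}}.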
* @param bytes byte array to trim
* @return the array without trailing zero bytes, or the original array if there is nothing to trim
*/
public static byte[] trim(byte[] bytes) {
int end = bytes.length - 1;
for (int i = end; i > -1; i--) {
if (bytes[i] != 0) {
end = i;
break;
}
}
if (end == bytes.length - 1) {
return bytes;
}
byte[] ret = new byte[end + 1];
System.arraycopy(bytes, 0, ret, 0, end + 1);
return ret;
}
}