All downloads are free. The search and download functionality uses the official Maven repository.

org.apache.tika.parser.dbf.DBFFileHeader Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.dbf;

import org.apache.commons.io.IOUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.EndianUtils;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.*;

/**
 * In-memory representation of the header of a DBF file: the version marker,
 * the last-modified date, the record count, the declared header/record sizes
 * and the column descriptors.
 * <p>
 * Instances are created only through {@link #parse(InputStream)}.
 */
class DBFFileHeader {

    /** Number of bytes in the fixed leading part of the header and in each column descriptor. */
    private static final int DESCRIPTOR_LENGTH = 32;

    /** Byte value that must immediately follow the last column descriptor. */
    private static final int HEADER_TERMINATOR = 13;

    private DBFReader.Version version;
    private Calendar lastModified;
    private int numRecords = -1;
    //The two sizes are stored on disk as 16-bit values; they are kept as ints
    //so that values above Short.MAX_VALUE are not misread as negative numbers.
    private int numBytesInHeader;
    private int numBytesInRecord;
    private DBFColumnHeader[] cols;

    /**
     * Reads a complete DBF header from the stream and leaves the stream
     * positioned directly after the number of header bytes declared in the file.
     *
     * @param is stream positioned at the first byte of the DBF file
     * @return the parsed header
     * @throws IOException   if the stream ends prematurely or cannot be read
     * @throws TikaException if the first byte is not a recognized version, a column
     *                       is longer than {@code DBFReader.MAX_FIELD_LENGTH}, the
     *                       header terminator is missing, or the declared header
     *                       size is smaller than what was actually read
     */
    public static DBFFileHeader parse(InputStream is) throws IOException, TikaException {
        DBFFileHeader header = new DBFFileHeader();

        int firstByte = is.read();
        header.version = DBFReader.getVersion(firstByte);
        if (header.version == null) {
            throw new TikaException("Unrecognized first byte in DBFFile: " + firstByte);
        }
        header.lastModified = readLastModified(is);

        header.numRecords = EndianUtils.readIntLE(is);
        //mask to reinterpret the signed shorts as unsigned 16-bit sizes
        header.numBytesInHeader = EndianUtils.readShortLE(is) & 0xFFFF;
        header.numBytesInRecord = EndianUtils.readShortLE(is) & 0xFFFF;
        IOUtils.skipFully(is, 20);//TODO: can get useful info out of here

        //Keep reading column descriptors until their combined field lengths
        //cover the declared record size; at least one descriptor is always read.
        //NOTE(review): the "- 1" presumably allows for a one-byte per-record
        //flag that is not part of any column -- confirm against the DBF spec.
        List<DBFColumnHeader> headers = new ArrayList<>();
        int bytesAccountedFor = 0;
        do {
            DBFColumnHeader colHeader = readCol(is);
            bytesAccountedFor += colHeader.fieldLength;
            headers.add(colHeader);
        } while (bytesAccountedFor < header.numBytesInRecord - 1);
        header.cols = headers.toArray(new DBFColumnHeader[0]);

        int endOfHeader = is.read();
        if (endOfHeader != HEADER_TERMINATOR) {
            throw new TikaException("Expected new line at end of header");
        }

        //fixed part + one descriptor per column + the terminator byte
        long totalReadSoFar = DESCRIPTOR_LENGTH
                + ((long) headers.size() * DESCRIPTOR_LENGTH) + 1;
        //there can be extra bytes in the header
        long extraHeaderBytes = header.numBytesInHeader - totalReadSoFar;
        if (extraHeaderBytes < 0) {
            //report a malformed file as a TikaException instead of passing
            //a negative count to skipFully
            throw new TikaException("Declared header length (" + header.numBytesInHeader
                    + ") is smaller than the number of header bytes read ("
                    + totalReadSoFar + ")");
        }
        IOUtils.skipFully(is, extraHeaderBytes);
        return header;
    }

    /**
     * Reads the three single-byte year/month/day values and converts them
     * to a UTC calendar set to midnight of that day.
     *
     * @param is stream positioned at the year byte
     * @return the last-modified date
     * @throws IOException if the stream ends before all three bytes were read
     */
    private static Calendar readLastModified(InputStream is) throws IOException {
        int lastModYear = is.read();
        int lastModMonth = is.read();
        int lastModDay = is.read();
        if (lastModYear < 0 || lastModMonth < 0 || lastModDay < 0) {
            throw new IOException("File truncated before last modified date in header");
        }
        Calendar now = GregorianCalendar.getInstance(
                TimeZone.getTimeZone("UTC"), Locale.ROOT);

        //if interpreting the stored year as an offset from 2000 would put the
        //date after the current year, interpret it as an offset from 1900 instead
        if (lastModYear + 2000 > now.get(Calendar.YEAR)) {
            lastModYear += 1900;
        } else {
            lastModYear += 2000;
        }
        Calendar lastModified = new GregorianCalendar(
                TimeZone.getTimeZone("UTC"), Locale.ROOT);
        //a new calendar starts at the current time and set(...) keeps the fields
        //it is not given (e.g. milliseconds), so reset everything first
        lastModified.clear();
        lastModified.set(lastModYear, lastModMonth - 1, lastModDay, 0, 0, 0);
        return lastModified;
    }

    /**
     * Reads one 32-byte column descriptor.
     *
     * @param is stream positioned at the start of a column descriptor
     * @return the column header with name, type, field length and decimal count set
     * @throws IOException   if fewer than 32 bytes are available
     * @throws TikaException if the field length exceeds {@code DBFReader.MAX_FIELD_LENGTH}
     */
    private static DBFColumnHeader readCol(InputStream is) throws IOException, TikaException {
        byte[] fieldRecord = new byte[DESCRIPTOR_LENGTH];
        IOUtils.readFully(is, fieldRecord);

        DBFColumnHeader col = new DBFColumnHeader();
        //only the first 10 bytes are copied, so the 11th byte of the name stays 0
        col.name = new byte[11];
        System.arraycopy(fieldRecord, 0, col.name, 0, 10);

        //single bytes are masked to the range 0..255, so they can never be negative
        col.setType(fieldRecord[11] & 0xFF);
        col.fieldLength = fieldRecord[16] & 0xFF;
        if (col.fieldLength > DBFReader.MAX_FIELD_LENGTH) {
            throw new TikaException("Field length ("+col.fieldLength+") is greater than DBReader.MAX_FIELD_LENGTH ("+
                    DBFReader.MAX_FIELD_LENGTH+")");
        }
        col.decimalCount = fieldRecord[17] & 0xFF;
        return col;
    }

    /** @return the column headers in the order they were read from the file */
    DBFColumnHeader[] getCols() {
        return cols;
    }

    /** @return the record count stored in the header, or -1 if nothing has been parsed */
    int getNumRecords() {
        return numRecords;
    }

    /** @return the last-modified date from the header, as a UTC calendar */
    Calendar getLastModified() {
        return lastModified;
    }

    /** @return the version derived from the first byte of the file */
    DBFReader.Version getVersion() {
        return version;
    }

    @Override
    public String toString() {
        return "DBFFileHeader{" +
                "lastModified=" + lastModified +
                ", numRecords=" + numRecords +
                ", numBytesInHeader=" + numBytesInHeader +
                ", numBytesInRecord=" + numBytesInRecord +
                ", cols=" + Arrays.toString(cols) +
                '}';
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy