// org.apache.tika.parser.dbf.DBFFileHeader Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.dbf;
import org.apache.commons.io.IOUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.EndianUtils;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.*;
/**
 * In-memory representation of the header of a DBF file: the version byte,
 * the last-modified date, the record count, the declared header and record
 * lengths, and the column (field) descriptors.
 *
 * Instances are created with {@link #parse(InputStream)}.
 */
class DBFFileHeader {

    /** Number of bytes in the fixed leading part of the header and in each column descriptor. */
    private static final int DESCRIPTOR_LENGTH = 32;

    /** Byte value that {@link #parse(InputStream)} requires right after the last column descriptor. */
    private static final int HEADER_TERMINATOR = 13;

    private DBFReader.Version version;
    private Calendar lastModified;
    private int numRecords = -1;
    // Held as int: the two 16-bit lengths are read unsigned (see parse()).
    private int numBytesInHeader;
    private int numBytesInRecord;
    private DBFColumnHeader[] cols;

    /**
     * Reads a DBF header from the current position of the stream and leaves
     * the stream positioned after the number of header bytes that the file
     * declares.
     *
     * @param is stream positioned at the first byte of the DBF file
     * @return the parsed header
     * @throws IOException   if the underlying stream fails or ends while
     *                       bytes are being read or skipped
     * @throws TikaException if the first byte does not map to a known
     *                       version, a column descriptor is rejected, the
     *                       header terminator is missing, or the declared
     *                       header length is smaller than what was read
     */
    public static DBFFileHeader parse(InputStream is) throws IOException, TikaException {
        DBFFileHeader header = new DBFFileHeader();

        int firstByte = is.read();
        header.version = DBFReader.getVersion(firstByte);
        if (header.version == null) {
            throw new TikaException("Unrecognized first byte in DBFFile: " + firstByte);
        }

        header.lastModified = readLastModified(is);
        header.numRecords = EndianUtils.readIntLE(is);
        // Mask to 16 bits so that lengths above Short.MAX_VALUE are not
        // turned into negative numbers by sign extension.
        header.numBytesInHeader = EndianUtils.readShortLE(is) & 0xFFFF;
        header.numBytesInRecord = EndianUtils.readShortLE(is) & 0xFFFF;
        IOUtils.skipFully(is, 20);//TODO: can get useful info out of here

        // Keep reading column descriptors until their field lengths account
        // for the whole record; the "- 1" leaves one byte of each record
        // unassigned to any column.
        List<DBFColumnHeader> headers = new ArrayList<>();
        int bytesAccountedFor = 0;
        do {
            DBFColumnHeader colHeader = readCol(is);
            bytesAccountedFor += colHeader.fieldLength;
            headers.add(colHeader);
        } while (bytesAccountedFor < header.numBytesInRecord - 1);
        header.cols = headers.toArray(new DBFColumnHeader[0]);

        int endOfHeader = is.read();
        if (endOfHeader != HEADER_TERMINATOR) {
            throw new TikaException("Expected new line at end of header");
        }

        // fixed part + one descriptor per column + terminator byte
        long totalReadSoFar = DESCRIPTOR_LENGTH + ((long) headers.size() * DESCRIPTOR_LENGTH) + 1;
        //there can be extra bytes in the header
        long extraHeaderBytes = header.numBytesInHeader - totalReadSoFar;
        if (extraHeaderBytes < 0) {
            // Report a malformed file as a parse failure instead of letting
            // skipFully reject the negative count with an unchecked exception.
            throw new TikaException("Declared header length (" + header.numBytesInHeader +
                    ") is smaller than the number of header bytes read (" + totalReadSoFar + ")");
        }
        IOUtils.skipFully(is, extraHeaderBytes);
        return header;
    }

    /**
     * Reads the three single-byte date fields (year, month, day) and turns
     * them into a UTC calendar at midnight.
     *
     * The year byte is first tried as an offset from 2000; if that would
     * place the date after the current year, it is taken as an offset from
     * 1900 instead.
     *
     * @param is stream positioned at the year byte
     * @return the last-modified date with all time-of-day fields set to zero
     * @throws IOException if the underlying stream fails
     */
    private static Calendar readLastModified(InputStream is) throws IOException {
        int year = is.read();
        int month = is.read();
        int day = is.read();

        TimeZone utc = TimeZone.getTimeZone("UTC");
        Calendar now = Calendar.getInstance(utc, Locale.ROOT);
        year += (year + 2000 > now.get(Calendar.YEAR)) ? 1900 : 2000;

        Calendar modified = new GregorianCalendar(utc, Locale.ROOT);
        modified.set(year, month - 1, day, 0, 0, 0);
        // The calendar starts out as "now"; set(...) above does not touch
        // the millisecond field, so reset it to keep the value deterministic.
        modified.set(Calendar.MILLISECOND, 0);
        return modified;
    }

    /**
     * Reads one 32-byte column descriptor.
     *
     * @param is stream positioned at the start of a column descriptor
     * @return the populated column header
     * @throws IOException   if fewer than 32 bytes can be read
     * @throws TikaException if the field length exceeds
     *                       {@code DBFReader.MAX_FIELD_LENGTH}, or if the
     *                       column type is rejected by {@code setType}
     */
    private static DBFColumnHeader readCol(InputStream is) throws IOException, TikaException {
        byte[] fieldRecord = new byte[DESCRIPTOR_LENGTH];
        IOUtils.readFully(is, fieldRecord);

        DBFColumnHeader col = new DBFColumnHeader();
        // 11-byte name buffer; only the first 10 bytes are copied, so the
        // last byte always stays 0.
        col.name = new byte[11];
        System.arraycopy(fieldRecord, 0, col.name, 0, 10);

        // The values below are masked to 0..255, so they can never be negative.
        col.setType(fieldRecord[11] & 0xFF);
        col.fieldLength = fieldRecord[16] & 0xFF;
        if (col.fieldLength > DBFReader.MAX_FIELD_LENGTH) {
            throw new TikaException("Field length ("+col.fieldLength+") is greater than DBReader.MAX_FIELD_LENGTH ("+
                    DBFReader.MAX_FIELD_LENGTH+")");
        }
        col.decimalCount = fieldRecord[17] & 0xFF;
        return col;
    }

    /** @return the column descriptors in the order they appear in the file */
    DBFColumnHeader[] getCols() {
        return cols;
    }

    /** @return the record count read from the header, or -1 if nothing has been parsed */
    int getNumRecords() {
        return numRecords;
    }

    /** @return the last-modified date as a UTC calendar */
    Calendar getLastModified() {
        return lastModified;
    }

    /** @return the version derived from the first byte of the file */
    DBFReader.Version getVersion() {
        return version;
    }

    @Override
    public String toString() {
        return "DBFFileHeader{" +
                "lastModified=" + lastModified +
                ", numRecords=" + numRecords +
                ", numBytesInHeader=" + numBytesInHeader +
                ", numBytesInRecord=" + numBytesInRecord +
                ", cols=" + Arrays.toString(cols) +
                '}';
    }
}