// edu.pitt.dbmi.data.reader.util.TextFileUtils (Maven / Gradle / Ivy)
/*
* Copyright (C) 2019 University of Pittsburgh.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301 USA
*/
package edu.pitt.dbmi.data.reader.util;
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
/**
* Mar 8, 2017 10:51:43 AM
*
* @author Kevin V. Bui ([email protected])
* @version $Id: $Id
*/
public class TextFileUtils {

    /**
     * The line feed delimiter for text data files.
     */
    protected static final byte LINE_FEED = '\n';

    /**
     * The carriage return delimiter for text data files.
     */
    protected static final byte CARRIAGE_RETURN = '\r';

    /**
     * The space character for text data files.
     */
    protected static final byte SPACE_CHAR = ' ';

    private TextFileUtils() {
    }

    /**
     * Determine the delimiter for a text data file.
     *
     * Reads the first n lines of data in a text file, counts how often each
     * single-byte character in the range 0-127 occurs outside of quoted
     * sections, and returns the candidate delimiter with the highest count.
     * On a tie the candidate that appears first in {@code delims} wins; if
     * nothing is counted (for example when {@code n} is 0 or the file is
     * empty) the first candidate is returned.
     *
     * Lines whose first non-blank characters equal the comment marker are
     * ignored, as are the first {@code skip} non-comment, non-blank lines.
     * Neither kind counts toward {@code n}; blank lines do count toward
     * {@code n}.
     *
     * Idea expanded from https://rdrr.io/cran/reader/man/get.delim.html.
     *
     * @param file the file to examine
     * @param n the number of lines to read to make the inference
     * @param skip number of lines to skip at top of file before processing
     * @param comment a comment symbol to ignore lines in files; null or blank
     * disables comment handling
     * @param quoteCharacter used for grouping characters
     * @param delims the set of delimiters to test for; candidates outside the
     * range 0-255 are treated as never occurring
     * @return the inferred delimiter
     * @throws java.io.IOException if an I/O error occurs
     * @throws IllegalArgumentException if file is null, n or skip is
     * negative, or delims is null or empty
     */
    public static char inferDelimiter(File file, int n, int skip, String comment, char quoteCharacter, char[] delims) throws IOException {
        if (file == null) {
            throw new IllegalArgumentException("Parameter file cannot be null.");
        }
        if (n < 0) {
            throw new IllegalArgumentException("Parameter n must be positive integer.");
        }
        if (skip < 0) {
            throw new IllegalArgumentException("Parameter skip must be positive integer.");
        }
        if (delims == null || delims.length == 0) {
            throw new IllegalArgumentException("Parameter delims cannot be null or empty.");
        }

        comment = (comment == null) ? "" : comment.trim();

        int[] characters = new int[256];
        try (FileChannel fc = new RandomAccessFile(file, "r").getChannel()) {
            long fileSize = fc.size();
            long position = 0;
            // A single mapping cannot exceed Integer.MAX_VALUE bytes, so large files are read in windows.
            long size = (fileSize > Integer.MAX_VALUE) ? Integer.MAX_VALUE : fileSize;

            byte[] prefix = comment.getBytes();

            // Holds the bytes of the current line that were read while it was still undecided
            // whether the line is a comment (leading blanks plus a partially matched marker).
            // It grows on demand because the number of leading blanks is unbounded.
            ByteBuffer byteBuffer = ByteBuffer.allocate(Math.max(prefix.length, 16));

            int index = 0;                       // number of comment-marker bytes matched so far
            boolean hasQuoteChar = false;        // true while inside a quoted section
            boolean reqCheck = prefix.length > 0; // true while the current line may still be a comment
            boolean skipLine = false;            // true while the rest of the current line is ignored
            int lineCount = 0;
            byte quoteChar = (byte) quoteCharacter;
            byte prevNonBlankChar = TextFileUtils.SPACE_CHAR;
            byte prevChar = -1;
            do {
                MappedByteBuffer buffer = fc.map(FileChannel.MapMode.READ_ONLY, position, size);
                while (buffer.hasRemaining() && lineCount < n && !Thread.currentThread().isInterrupted()) {
                    byte currChar = buffer.get();
                    if (currChar == TextFileUtils.CARRIAGE_RETURN || currChar == TextFileUtils.LINE_FEED) {
                        if (skipLine) {
                            // Comment lines and skipped lines do not count toward n.
                            skipLine = false;
                        } else if (!(currChar == TextFileUtils.LINE_FEED && prevChar == TextFileUtils.CARRIAGE_RETURN)) {
                            // The LF of a CRLF pair belongs to the line already ended by the CR.
                            lineCount++;
                        }

                        // Every line starts with a clean slate, no matter how the previous one
                        // ended; otherwise comment detection depends on the line terminator and
                        // on what the previous line contained.
                        // Do not delete this casting; it is needed to fix a versioning problem.
                        ((java.nio.Buffer) byteBuffer).clear();
                        index = 0;
                        reqCheck = prefix.length > 0;
                        prevNonBlankChar = TextFileUtils.SPACE_CHAR;
                    } else if (!skipLine) {
                        if (currChar > TextFileUtils.SPACE_CHAR) {
                            prevNonBlankChar = currChar;
                        }

                        // Start matching the comment marker at the first non-blank byte of the line.
                        boolean commentLine = false;
                        if (reqCheck && prevNonBlankChar > TextFileUtils.SPACE_CHAR) {
                            if (currChar == prefix[index]) {
                                index++;
                                commentLine = (index == prefix.length);
                            } else {
                                index = 0;
                                reqCheck = false;
                            }
                        }

                        if (commentLine) {
                            index = 0;
                            skipLine = true;
                            prevNonBlankChar = TextFileUtils.SPACE_CHAR;
                            // Do not delete this casting; it is needed to fix a versioning problem.
                            ((java.nio.Buffer) byteBuffer).clear();
                        } else if (reqCheck) {
                            // Still undecided: remember the byte so it can be counted later
                            // if the line turns out to be data.
                            byteBuffer = ensureRoom(byteBuffer);
                            byteBuffer.put(currChar);
                        } else if (skip > 0) {
                            // First byte that proves this is not a comment line: consume one skip.
                            skip--;
                            skipLine = true;
                            // Do not delete this casting; it is needed to fix a versioning problem.
                            ((java.nio.Buffer) byteBuffer).clear();
                        } else {
                            // Data line: count the remembered bytes first, then the current one.
                            // Do not delete this casting; it is needed to fix a versioning problem.
                            ((java.nio.Buffer) byteBuffer).flip();
                            while (byteBuffer.hasRemaining() && !Thread.currentThread().isInterrupted()) {
                                hasQuoteChar = tally(byteBuffer.get(), quoteChar, hasQuoteChar, characters);
                            }
                            // Do not delete this casting; it is needed to fix a versioning problem.
                            ((java.nio.Buffer) byteBuffer).clear();

                            hasQuoteChar = tally(currChar, quoteChar, hasQuoteChar, characters);
                        }
                    }

                    prevChar = currChar;
                }

                position += size;
                if ((position + size) > fileSize) {
                    size = fileSize - position;
                }
            } while (position < fileSize && lineCount < n && !Thread.currentThread().isInterrupted());

            int maxIndex = 0;
            for (int i = 1; i < delims.length && !Thread.currentThread().isInterrupted(); i++) {
                if (countOf(characters, delims[maxIndex]) < countOf(characters, delims[i])) {
                    maxIndex = i;
                }
            }

            return delims[maxIndex];
        }
    }

    /**
     * Accounts for one byte of a data line: a quote byte toggles the quoted
     * state, any other non-negative byte outside quotes is counted.
     *
     * @param value the byte to account for
     * @param quoteChar the quote byte
     * @param insideQuotes whether the byte lies inside a quoted section
     * @param counts per-character occurrence counts, updated in place
     * @return the quoted state after this byte
     */
    private static boolean tally(byte value, byte quoteChar, boolean insideQuotes, int[] counts) {
        if (value == quoteChar) {
            return !insideQuotes;
        }
        if (!insideQuotes && value >= 0) {
            counts[value]++;
        }

        return insideQuotes;
    }

    /**
     * Returns the buffer itself if it can take one more byte, otherwise a
     * buffer of twice the capacity holding the same content.
     *
     * @param pending buffer in write mode
     * @return a buffer in write mode with at least one byte remaining
     */
    private static ByteBuffer ensureRoom(ByteBuffer pending) {
        if (pending.hasRemaining()) {
            return pending;
        }

        ByteBuffer larger = ByteBuffer.allocate(Math.max(16, pending.capacity() * 2));
        // Do not delete this casting; it is needed to fix a versioning problem.
        ((java.nio.Buffer) pending).flip();
        larger.put(pending);

        return larger;
    }

    /**
     * Looks up the occurrence count of a candidate delimiter; candidates
     * outside the counted range never occur.
     *
     * @param counts per-character occurrence counts
     * @param delim candidate delimiter
     * @return number of counted occurrences, 0 if delim is out of range
     */
    private static int countOf(int[] counts, char delim) {
        return (delim < counts.length) ? counts[delim] : 0;
    }

}