// edu.isi.nlp.io.ConcatenatedXMLIterableFactory Maven / Gradle / Ivy
package edu.isi.nlp.io;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import com.google.common.annotations.Beta;
import com.google.common.base.Function;
import com.google.common.collect.AbstractIterator;
import com.google.common.io.CharSource;
import com.google.common.io.Files;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Iterator;
/**
 * Provides a way of iterating over multiple XML files concatenated together. When processing all
 * files in a very large corpus, it is often significantly more efficient to read them from one
 * concatenated file than to open each document separately.
 *
 * <p>WARNING: The current implementation is limited and not generally correct. It is intended to
 * be used only on XML documents output by BBN Serif.
 */
@Beta
public final class ConcatenatedXMLIterableFactory {
/** Unit multiplier (1024 * 1024) for expressing size limits in megabytes. */
private static final int MEGABYTES = 1024 * 1024;

/** Prefix that identifies the first line of each document inside the concatenated input. */
private final String splitString;

/** Read-ahead limit passed to {@code mark} on the reader while a document is being scanned. */
private final int maxDocBytes;

/**
 * Creates a factory which splits input on lines starting with the given string.
 *
 * @param splitPattern the literal line prefix that opens every document; must not be null
 * @param maxDocBytes the read-ahead limit used when marking the reader; must be positive
 * @throws NullPointerException if {@code splitPattern} is null
 * @throws IllegalArgumentException if {@code maxDocBytes} is not positive
 */
private ConcatenatedXMLIterableFactory(String splitPattern, int maxDocBytes) {
  // Validate in the same order as before so the same exception wins when both arguments are bad.
  checkNotNull(splitPattern);
  checkArgument(maxDocBytes > 0);
  this.splitString = splitPattern;
  this.maxDocBytes = maxDocBytes;
}
public static ConcatenatedXMLIterableFactory splitOnXMLProlog() {
return new ConcatenatedXMLIterableFactory(" filesIn(CharSource source) {
return new ConcatenatedXMLFile(source);
}
public Function> asFunction(final Charset charset) {
return new Function>() {
@Override
public Iterable apply(final File input) {
return filesIn(Files.asCharSource(input, charset));
}
};
}
/**
 * The documents of one concatenated source, viewed as an {@link Iterable}. Every call to
 * {@link #iterator()} opens a new buffered stream on the source.
 *
 * <p>Since we can't declare an {@link java.io.IOException} on the {@link #iterator()} method, be
 * aware that any exceptions during reading will be wrapped in a {@link ConcatenatedXMLException}.
 */
private final class ConcatenatedXMLFile implements Iterable<CharSource> {
  private final CharSource source;

  /**
   * @param source the concatenated input to split; must not be null
   */
  public ConcatenatedXMLFile(CharSource source) {
    this.source = checkNotNull(source);
  }

  /**
   * Opens the source and returns an iterator over its documents.
   *
   * @throws ConcatenatedXMLException if the source cannot be opened
   */
  @Override
  public Iterator<CharSource> iterator() {
    try {
      return new ConcatenatedXMLIterator(source.openBufferedStream());
    } catch (IOException ioe) {
      throw new ConcatenatedXMLException(ioe);
    }
  }
}
/**
 * Unchecked carrier for a failure that occurred while opening or splitting a concatenated
 * source. The original failure is available both as the cause and through
 * {@link #getWrappedException()}.
 */
public static class ConcatenatedXMLException extends RuntimeException {
  private final Exception wrapped;

  /**
   * @param wrapped the underlying failure; must not be null
   */
  public ConcatenatedXMLException(Exception wrapped) {
    super(wrapped);
    checkNotNull(wrapped);
    this.wrapped = wrapped;
  }

  /** Returns the exception this instance was created from. */
  public Exception getWrappedException() {
    return this.wrapped;
  }
}
/**
 * Splits the content of a reader into documents, one per call to {@link #computeNext()}.
 *
 * <p>A document starts at a line beginning with the enclosing factory's split string and runs up
 * to (not including) the next such line or the end of input. The line carrying the split string
 * is consumed but is not part of the returned document, and every remaining line is terminated
 * with {@code "\n"} regardless of the terminator used in the input.
 *
 * <p>The reader is closed once the input is exhausted or a read fails. If iteration is abandoned
 * before that point the reader stays open.
 */
private class ConcatenatedXMLIterator extends AbstractIterator<CharSource> {
  private final BufferedReader reader;

  /**
   * @param bufferedReader the stream to split; must not be null
   */
  public ConcatenatedXMLIterator(BufferedReader bufferedReader) {
    this.reader = checkNotNull(bufferedReader);
  }

  /**
   * Reads the next document from the stream.
   *
   * @return the next document, or the end-of-data marker when the stream is exhausted
   * @throws ConcatenatedXMLException if reading fails or the next block does not begin with the
   *     split string
   */
  @Override
  protected CharSource computeNext() {
    try {
      final String firstLine = reader.readLine();
      // no more data - we're done, so the stream can be released
      if (firstLine == null) {
        reader.close();
        return endOfData();
      }
      // the reader should always be starting with the split string
      // note this means the file being split must begin with the split string
      if (!firstLine.startsWith(splitString)) {
        // routed through the catch below so the reader is closed before the failure is reported
        throw new IOException("Block does not start with split string " + splitString);
      }
      final StringBuilder data = new StringBuilder();
      String line;
      // we always mark our position before beginning a new line
      // so that if we find the beginning of the next block, we can back up
      // NOTE(review): mark() takes its limit in characters, not bytes; a single line longer than
      // the limit may make the reset() below fail - confirm the limit is generous enough.
      reader.mark(maxDocBytes);
      while ((line = reader.readLine()) != null) {
        // beginning of next block. Back up and return what we've accumulated
        if (line.startsWith(splitString)) {
          reader.reset();
          break;
        }
        // not done yet
        data.append(line).append("\n");
        reader.mark(maxDocBytes);
      }
      return CharSource.wrap(data.toString());
    } catch (IOException ioe) {
      closeAfterFailure(ioe);
      throw new ConcatenatedXMLException(ioe);
    }
  }

  /** Closes the reader after a failed read, recording any close error on the original failure. */
  private void closeAfterFailure(IOException failure) {
    try {
      reader.close();
    } catch (IOException closeFailure) {
      failure.addSuppressed(closeFailure);
    }
  }
}
}