edu.isi.nlp.io.ConcatenatedXMLIterableFactory Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of common-core-open Show documentation
The newest version!
package edu.isi.nlp.io;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;

import com.google.common.annotations.Beta;
import com.google.common.base.Function;
import com.google.common.collect.AbstractIterator;
import com.google.common.io.CharSource;
import com.google.common.io.Files;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Iterator;

/**
 * Provides a way of iterating over multiple XML files concatenated together. When processing all
 * files in a very large corpus, it is often significantly more efficient to do this.
 *
 * <p>WARNING: The current implementation is limited and not generally correct. It is intended to
 * be used only on XML documents output by BBN Serif.
 */
@Beta
public final class ConcatenatedXMLIterableFactory {

  // Prefix identifying the first line of each document; lines are tested with String.startsWith.
  private final String splitString;
  // Read-ahead limit handed to BufferedReader.mark() while scanning for the next document.
  private final int maxDocBytes;

  // Number of bytes in one megabyte.
  private static final int MEGABYTES = 1024 * 1024;

  /**
   * Builds a factory after validating its configuration.
   *
   * @param splitPattern prefix which marks the first line of every document; must not be null
   * @param maxDocBytes read-ahead limit used when marking the reader; must be positive
   */
  private ConcatenatedXMLIterableFactory(String splitPattern, int maxDocBytes) {
    // validate both arguments up front (same order as before), then store them
    checkNotNull(splitPattern);
    checkArgument(maxDocBytes > 0);

    this.splitString = splitPattern;
    this.maxDocBytes = maxDocBytes;
  }

  public static ConcatenatedXMLIterableFactory splitOnXMLProlog() {
    return new ConcatenatedXMLIterableFactory(" filesIn(CharSource source) {
    return new ConcatenatedXMLFile(source);
  }

  public Function> asFunction(final Charset charset) {
    return new Function>() {
      @Override
      public Iterable apply(final File input) {
        return filesIn(Files.asCharSource(input, charset));
      }
    };
  }

  /**
   * An iterable view of the documents concatenated in a single {@link CharSource}. Every call to
   * {@link #iterator()} opens a new buffered stream on the source.
   *
   * <p>Since we can't declare an {@link java.io.IOException} on the {@link #iterator()} method, be
   * aware that any exceptions during reading will be wrapped in a {@link
   * ConcatenatedXMLException}.
   */
  private final class ConcatenatedXMLFile implements Iterable<CharSource> {

    private final CharSource source;

    public ConcatenatedXMLFile(CharSource source) {
      this.source = checkNotNull(source);
    }

    /**
     * Opens the source and returns an iterator over its documents.
     *
     * @throws ConcatenatedXMLException if the source cannot be opened
     */
    @Override
    public Iterator<CharSource> iterator() {
      try {
        return new ConcatenatedXMLIterator(source.openBufferedStream());
      } catch (IOException ioe) {
        throw new ConcatenatedXMLException(ioe);
      }
    }
  }

  /**
   * Unchecked wrapper for failures encountered while reading concatenated documents. The
   * original exception is available both as the cause and through {@link
   * #getWrappedException()}.
   */
  public static class ConcatenatedXMLException extends RuntimeException {

    private final Exception wrapped;

    /**
     * @param wrapped the underlying failure; must not be null
     */
    public ConcatenatedXMLException(Exception wrapped) {
      // a null argument is rejected with a NullPointerException, as before
      super(checkNotNull(wrapped));
      this.wrapped = wrapped;
    }

    /** Returns the exception this instance was created from. */
    public Exception getWrappedException() {
      return this.wrapped;
    }
  }

  /**
   * Lazily splits the text behind a {@link BufferedReader} into one {@link CharSource} per
   * document.
   *
   * <p>Every document must begin with a line starting with the split string. That line is
   * consumed but is not included in the returned text. The lines after it, up to but excluding
   * the next split line, are returned with each one terminated by {@code "\n"} whatever the
   * original line terminator was.
   *
   * <p>The reader is closed once the input is exhausted or reading fails. If iteration is
   * abandoned before that point the reader is left open.
   */
  private class ConcatenatedXMLIterator extends AbstractIterator<CharSource> {

    private final BufferedReader reader;

    public ConcatenatedXMLIterator(BufferedReader bufferedReader) {
      this.reader = checkNotNull(bufferedReader);
    }

    /**
     * Reads the next document from the reader.
     *
     * @return the text of the next document, or the end-of-data marker when the input is exhausted
     * @throws ConcatenatedXMLException if reading fails or a block does not begin with the split
     *     string
     */
    @Override
    protected CharSource computeNext() {
      try {
        final String firstLine = reader.readLine();
        // no more data - we're done, so release the underlying stream
        if (firstLine == null) {
          reader.close();
          return endOfData();
        }

        // the reader should always be starting with the split string
        // note this means the file being split must begin with the split string
        if (!firstLine.startsWith(splitString)) {
          throw closeAndWrap(
              new IOException("Block does not start with split string " + splitString));
        }

        final StringBuilder data = new StringBuilder();

        String line;
        // we always mark our position before beginning a new line
        // so that if we find the beginning of the next block, we can back up.
        // BufferedReader counts the read-ahead limit in characters; because the mark is
        // refreshed before every line, only a single line has to fit within that limit.
        reader.mark(maxDocBytes);
        while ((line = reader.readLine()) != null) {
          // beginning of next block. Back up and return what we've accumulated
          if (line.startsWith(splitString)) {
            reader.reset();
            break;
          } else {
            // not done yet
            data.append(line).append("\n");
            reader.mark(maxDocBytes);
          }
        }

        return CharSource.wrap(data.toString());
      } catch (IOException ioe) {
        throw closeAndWrap(ioe);
      }
    }

    /**
     * Closes the reader after a failure and wraps the cause for rethrowing. A failure to close is
     * recorded on the cause as a suppressed exception rather than replacing it.
     */
    private ConcatenatedXMLException closeAndWrap(IOException cause) {
      try {
        reader.close();
      } catch (IOException closeFailure) {
        cause.addSuppressed(closeFailure);
      }
      return new ConcatenatedXMLException(cause);
    }
  }
}