// gate.corpora.CorpusImpl (Maven / Gradle / Ivy)

/*
 *  CorpusImpl.java
 *
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Hamish Cunningham, 11/Feb/2000
 *
 *  $Id: CorpusImpl.java 17604 2014-03-09 10:08:13Z markagreenwood $
 */

package gate.corpora;

import gate.Corpus;
import gate.Document;
import gate.Factory;
import gate.FeatureMap;
import gate.Gate;
import gate.Resource;
import gate.creole.AbstractLanguageResource;
import gate.creole.CustomDuplication;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.event.CorpusEvent;
import gate.event.CorpusListener;
import gate.event.CreoleEvent;
import gate.event.CreoleListener;
import gate.event.StatusListener;
import gate.util.BomStrippingInputStreamReader;
import gate.util.Err;
import gate.util.Files;
import gate.util.Strings;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Serializable;
import java.net.URL;
import java.util.AbstractList;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.ListIterator;
import java.util.Vector;

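/**
 * A transient (in-memory) corpus: a list of {@link Document}s. The
 * documents are held in a synchronized list that fires corpus events
 * whenever a document is added or removed.
 * <p>
 * NOTE: this comment previously stated that corpora are sets of documents
 * ordered by lexicographic collation on URL; nothing in this class sorts
 * its members that way.
 */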
@CreoleResource(name = "GATE Corpus", comment = "GATE transient corpus.", interfaceName = "gate.Corpus", icon = "corpus-trans", helpURL = "http://gate.ac.uk/userguide/sec:developer:loadlr")
public class CorpusImpl extends AbstractLanguageResource implements Corpus,
                                                        CreoleListener,
                                                        CustomDuplication {

  /**
   * Creates an empty corpus. The documents are kept in a synchronized
   * wrapper around an event-firing {@link VerboseList}, and the new corpus
   * registers itself as a listener on the CREOLE register.
   */
  public CorpusImpl() {
    VerboseList eventFiringList = new VerboseList();
    this.supportList = Collections.synchronizedList(eventFiringList);
    Gate.getCreoleRegister().addCreoleListener(this);
  }

  /**
   * Gets the names of the documents in this corpus.
   * 
   * @return a new {@link List} of Strings holding the names of the
   *         documents in this corpus, in corpus order.
   */
  @Override
  public List<String> getDocumentNames() {
    // A list obtained from Collections.synchronizedList must be locked
    // manually for the whole duration of an iteration.
    synchronized(supportList) {
      List<String> names = new ArrayList<String>(supportList.size());
      for(Object document : supportList) {
        names.add(((Document)document).getName());
      }
      return names;
    }
  }

  /**
   * Gets the name of a document in this corpus.
   * 
   * @param index the index of the document
   * @return a String value representing the name of the document at
   *         index in this corpus.
   */
  @Override
  public String getDocumentName(int index) {
    // explicit cast: the element type is not visible through a raw list
    Document document = (Document)supportList.get(index);
    return document.getName();
  }

  /**
   * No-op: unloading a document is meaningless for a transient corpus, so
   * the request is ignored.
   * 
   * @param doc the document that would be unloaded (ignored)
   */
  @Override
  public void unloadDocument(Document doc) {
    // intentionally empty
  }

  /**
   * The underlying list that holds the documents in this corpus. It is
   * assigned in the constructor; all {@link List} operations of the corpus
   * are forwarded to it.
   */
  protected List<Document> supportList = null;

  /**
   * A proxy list that stores the actual data in an internal list and
   * forwards all operations to that one but it also fires the
   * appropriate corpus events when necessary. The element type is fixed
   * to {@link Document}, so only Documents are accepted as corpus members.
   */
  protected class VerboseList extends AbstractList<Document> implements Serializable {

    private static final long serialVersionUID = 3483062654980468826L;

    /** Creates an empty list backed by a fresh {@link ArrayList}. */
    VerboseList() {
      data = new ArrayList<Document>();
    }

    /** Returns the document stored at the given position. */
    @Override
    public Document get(int index) {
      return data.get(index);
    }

    /** Returns the number of documents stored. */
    @Override
    public int size() {
      return data.size();
    }

    /**
     * Replaces the document at the given position and reports the change
     * as a removal of the old document followed by an addition of the new
     * one.
     * 
     * @return the document previously stored at that position
     */
    @Override
    public Document set(int index, Document element) {
      Document oldDoc = data.set(index, element);

      // fire the 2 events
      fireDocumentRemoved(new CorpusEvent(CorpusImpl.this, oldDoc, index,
              CorpusEvent.DOCUMENT_REMOVED));
      fireDocumentAdded(new CorpusEvent(CorpusImpl.this, element, index,
              CorpusEvent.DOCUMENT_ADDED));
      return oldDoc;
    }

    /** Inserts the document at the given position and fires an "added" event. */
    @Override
    public void add(int index, Document element) {
      data.add(index, element);

      // fire the event
      fireDocumentAdded(new CorpusEvent(CorpusImpl.this, element,
              index, CorpusEvent.DOCUMENT_ADDED));
    }

    /**
     * Removes the document at the given position and fires a "removed"
     * event.
     * 
     * @return the document that was removed
     */
    @Override
    public Document remove(int index) {
      Document oldDoc = data.remove(index);

      fireDocumentRemoved(new CorpusEvent(CorpusImpl.this, oldDoc, index,
              CorpusEvent.DOCUMENT_REMOVED));
      return oldDoc;
    }

    /**
     * The List containing the actual data.
     */
    List<Document> data;
  }

  /**
   * Reports whether the document at the given position is loaded. This
   * implementation answers {@code true} for every index without looking
   * at the index.
   * 
   * @param index the position of the document (not inspected)
   * @return always {@code true}
   */
  @Override
  public boolean isDocumentLoaded(int index) {
    return true;
  }

  /**
   * Empties the backing document list, if one has been created.
   */
  protected void clearDocList() {
    if(supportList != null) {
      supportList.clear();
    }
  }

  // List methods
  // java docs will be automatically copied from the List interface.

  @Override
  public int size() {
    // the corpus size is the size of the backing list
    final int documentCount = supportList.size();
    return documentCount;
  }

  @Override
  public boolean isEmpty() {
    // forwarded to the backing list
    final boolean noDocuments = supportList.isEmpty();
    return noDocuments;
  }

  @Override
  public boolean contains(Object o) {
    // membership test is forwarded to the backing list
    final boolean present = supportList.contains(o);
    return present;
  }

  @Override
  public Iterator<Document> iterator() {
    // Forwarded to the backing list. Note that iterating a list obtained
    // from Collections.synchronizedList is only thread-safe if the caller
    // holds that list's monitor for the duration of the iteration.
    return supportList.iterator();
  }

  @Override
  public Object[] toArray() {
    // array snapshot produced by the backing list
    final Object[] snapshot = supportList.toArray();
    return snapshot;
  }

  // The cast is needed when the backing list is accessed through a raw
  // type (its toArray then returns Object[]); the backing list returns an
  // array of the runtime type of 'a', so the cast is safe.
  @Override
  @SuppressWarnings("unchecked")
  public <T> T[] toArray(T[] a) {
    return (T[])supportList.toArray(a);
  }

  @Override
  public boolean add(Document o) {
    // appending goes through the backing list
    final boolean changed = supportList.add(o);
    return changed;
  }

  @Override
  public boolean remove(Object o) {
    // removal by element is forwarded to the backing list
    final boolean changed = supportList.remove(o);
    return changed;
  }

  @Override
  public boolean containsAll(Collection<?> c) {
    // forwarded to the backing list
    return supportList.containsAll(c);
  }

  @Override
  public boolean addAll(Collection<? extends Document> c) {
    // forwarded to the backing list
    return supportList.addAll(c);
  }

  @Override
  public boolean addAll(int index, Collection<? extends Document> c) {
    // forwarded to the backing list
    return supportList.addAll(index, c);
  }

  @Override
  public boolean removeAll(Collection<?> c) {
    // forwarded to the backing list
    return supportList.removeAll(c);
  }

  @Override
  public boolean retainAll(Collection<?> c) {
    // forwarded to the backing list
    return supportList.retainAll(c);
  }

  @Override
  public void clear() {
    // unlike clearDocList(), this does not check the backing list for null
    this.supportList.clear();
  }

  @Override
  public boolean equals(Object o) {
    // Only another CorpusImpl can be equal; the comparison itself is
    // carried out by the backing list against the other corpus.
    return (o instanceof CorpusImpl) && supportList.equals(o);
  }

  @Override
  public int hashCode() {
    // the corpus hash is the hash of the backing list
    final int listHash = supportList.hashCode();
    return listHash;
  }

  @Override
  public Document get(int index) {
    // explicit cast: the element type is not visible through a raw list
    return (Document)supportList.get(index);
  }

  @Override
  public Document set(int index, Document element) {
    // explicit cast: the element type is not visible through a raw list
    return (Document)supportList.set(index, element);
  }

  @Override
  public void add(int index, Document element) {
    // positional insert is forwarded to the backing list
    this.supportList.add(index, element);
  }

  @Override
  public Document remove(int index) {
    // explicit cast: the element type is not visible through a raw list
    return (Document)supportList.remove(index);
  }

  @Override
  public int indexOf(Object o) {
    // position lookup is forwarded to the backing list
    final int position = supportList.indexOf(o);
    return position;
  }

  @Override
  public int lastIndexOf(Object o) {
    // position lookup is forwarded to the backing list
    final int position = supportList.lastIndexOf(o);
    return position;
  }

  @Override
  public ListIterator<Document> listIterator() {
    // forwarded to the backing list
    return supportList.listIterator();
  }

  @Override
  public ListIterator<Document> listIterator(int index) {
    // forwarded to the backing list
    return supportList.listIterator(index);
  }

  @Override
  public List<Document> subList(int fromIndex, int toIndex) {
    // the returned view is produced by the backing list
    return supportList.subList(fromIndex, toIndex);
  }

  /**
   * Releases this resource by unregistering the corpus from the CREOLE
   * register, undoing the registration made in the constructor.
   */
  @Override
  public void cleanup() {
    Gate.getCreoleRegister().removeCreoleListener(this);
  }

  /**
   * Initialises this resource: if a non-empty documents list was supplied
   * through {@link #setDocumentsList}, its members are added to the corpus.
   * 
   * @return this corpus
   */
  @Override
  public Resource init() {
    boolean hasInitialDocuments =
            documentsList != null && !documentsList.isEmpty();
    if(hasInitialDocuments) {
      addAll(documentsList);
    }
    return this;
  } // init()

  /**
   * Populates a corpus with documents created from the files found in a
   * directory, without forcing a mime type on the created documents.
   * This is a convenience overload that forwards to
   * {@link #populate(Corpus, URL, FileFilter, String, String, boolean)}
   * with a {@code null} mime type.
   * 
   * @param corpus the corpus to be populated
   * @param directory the directory to read from, given as a URL; it must
   *          use the "file" protocol, otherwise an
   *          IllegalArgumentException is thrown.
   * @param filter the file filter used to choose files from the
   *          directory; {@code null} accepts every file. A simple
   *          extension-based filter is available as
   *          {@link gate.util.ExtensionFileFilter}.
   * @param encoding the encoding to be used for reading the documents
   * @param recurseDirectories if true, files in sub-directories (at any
   *          depth) are considered as well; otherwise only the files
   *          directly inside the directory are.
   * @throws java.io.IOException if the directory does not exist
   */
  public static void populate(Corpus corpus, URL directory, FileFilter filter,
          String encoding, boolean recurseDirectories) throws IOException {
    final String noMimeType = null;
    CorpusImpl.populate(corpus, directory, filter, encoding, noMimeType,
            recurseDirectories);
  }

  /**
   * Fills the provided corpus with documents created on the fly from
   * selected files in a directory. Uses a {@link FileFilter} to select
   * which files will be used and which will be ignored. A simple file
   * filter based on extensions is provided in the Gate distribution (
   * {@link gate.util.ExtensionFileFilter}).
   * <p>
   * The selected files are processed in order of their simple file name
   * (ignoring the directory they are in). A file for which no document
   * can be created is reported on the error stream and skipped; it does
   * not abort the run.
   * 
   * @param corpus the corpus to be populated
   * @param directory the directory from which the files will be picked.
   *          This parameter is an URL for uniformity. It needs to be a
   *          URL of type file otherwise an IllegalArgumentException
   *          will be thrown.
   * @param filter the file filter used to select files from the target
   *          directory. If the filter is null all the files
   *          will be accepted.
   * @param encoding the encoding to be used for reading the documents;
   *          if null no encoding parameter is passed to the documents.
   * @param mimeType the mime type to set on the created documents; if
   *          null no mime type parameter is passed to the documents.
   * @param recurseDirectories should the directory be parsed
   *          recursively?. If true all the files from the
   *          provided directory and all its children directories (on as
   *          many levels as necessary) will be picked if accepted by
   *          the filter otherwise the children directories will be
   *          ignored.
   * @throws java.io.FileNotFoundException if the directory doesn't exist
   * @throws IllegalArgumentException if the URL is not a file URL or
   *           does not point to a directory
   * @throws java.io.IOException declared for I/O problems
   */
  public static void populate(Corpus corpus, URL directory, FileFilter filter,
          String encoding, String mimeType, boolean recurseDirectories)
          throws IOException {

    // check input
    if(!directory.getProtocol().equalsIgnoreCase("file"))
      throw new IllegalArgumentException(
              "The URL provided is not of type \"file:\"!");

    File dir = Files.fileFromURL(directory);
    if(!dir.exists()) throw new FileNotFoundException(dir.toString());

    if(!dir.isDirectory())
      throw new IllegalArgumentException(dir.getAbsolutePath()
              + " is not a directory!");

    File[] files;
    // populate the corpus
    if(recurseDirectories) {
      files = Files.listFilesRecursively(dir, filter);
    }
    else {
      files = dir.listFiles(filter);
    }

    // nothing could be listed -> nothing to do
    if(files == null) {
      return;
    }

    // sort the files alphabetically regardless of their paths
    Arrays.sort(files, new Comparator<File>() {
      @Override
      public int compare(File f1, File f2) {
        return f1.getName().compareTo(f2.getName());
      }
    });

    // create the GATE documents
    for(File file : files) {
      // directories may be part of the listing; they are not documents
      if(file.isDirectory()) {
        continue;
      }
      StatusListener sListener = (StatusListener)Gate.getListeners().get(
              "gate.event.StatusListener");
      if(sListener != null)
        sListener.statusChanged("Reading: " + file.getName());
      String docName = file.getName() + "_" + Gate.genSym();
      FeatureMap params = Factory.newFeatureMap();
      params.put(Document.DOCUMENT_URL_PARAMETER_NAME, file.toURI().toURL());
      if(encoding != null)
        params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, encoding);
      if(mimeType != null)
        params.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, mimeType);

      try {
        Document doc = (Document)Factory.createResource(DocumentImpl.class
                .getName(), params, null, docName);
        corpus.add(doc);
        if(corpus.getLRPersistenceId() != null) {
          // persistent corpus -> unload the document
          corpus.unloadDocument(doc);
          Factory.deleteResource(doc);
        }
      }
      catch(Throwable t) {
        // deliberate best effort: report the failure and carry on with
        // the remaining files
        String nl = Strings.getNl();
        Err.prln("WARNING: Corpus.populate could not instantiate document" + nl
                + "  Document name was: " + docName + nl + "  Exception was: "
                + t + nl + nl);
        t.printStackTrace();
      }
      if(sListener != null) sListener.statusChanged(file.getName() + " read");
    }

  }// public static void populate

  /**
   * Populates this corpus with documents created from the files found in
   * a directory, without forcing a mime type on the created documents.
   * The work is done by the static
   * {@link gate.corpora.CorpusImpl#populate(Corpus, URL, FileFilter, String, String, boolean)}
   * method, called with this corpus and a {@code null} mime type.
   * 
   * @param directory the directory to read from, given as a URL; it must
   *          use the "file" protocol, otherwise an
   *          IllegalArgumentException is thrown.
   * @param filter the file filter used to choose files from the
   *          directory; {@code null} accepts every file.
   * @param encoding the encoding to be used for reading the documents
   * @param recurseDirectories if true, files in sub-directories (at any
   *          depth) are considered as well; otherwise only the files
   *          directly inside the directory are.
   */
  @Override
  public void populate(URL directory, FileFilter filter, String encoding,
          boolean recurseDirectories) throws IOException,
          ResourceInstantiationException {
    final String noMimeType = null;
    CorpusImpl.populate(this, directory, filter, encoding, noMimeType,
            recurseDirectories);
  }

  /**
   * Populates this corpus with documents created from the files found in
   * a directory. The work is done by the static
   * {@link gate.corpora.CorpusImpl#populate(Corpus, URL, FileFilter, String, String, boolean)}
   * method, called with this corpus as its target.
   * 
   * @param directory the directory to read from, given as a URL; it must
   *          use the "file" protocol, otherwise an
   *          IllegalArgumentException is thrown.
   * @param filter the file filter used to choose files from the
   *          directory; {@code null} accepts every file.
   * @param encoding the encoding to be used for reading the documents
   * @param mimeType the mime type to be used when loading documents; if
   *          null, no mime type is passed on to the created documents.
   * @param recurseDirectories if true, files in sub-directories (at any
   *          depth) are considered as well; otherwise only the files
   *          directly inside the directory are.
   */
  @Override
  public void populate(URL directory, FileFilter filter, String encoding,
          String mimeType, boolean recurseDirectories) throws IOException,
          ResourceInstantiationException {
    CorpusImpl.populate(this, directory, filter, encoding, mimeType,
            recurseDirectories);
  }

  /**
   * Fills the provided corpus with documents extracted from the
   * provided trec file (a single file containing many documents, each
   * enclosed in the same root element).
   * <p>
   * The file is read line by line. A document starts at the first
   * occurrence of the opening tag of the root element (with or without
   * attributes) and ends at the next occurrence of its closing tag; tag
   * matching is case-insensitive. A document that cannot be instantiated
   * is reported on the error stream and skipped.
   * 
   * @param corpus the corpus to be populated.
   * @param singleConcatenatedFile the trec file.
   * @param documentRootElement text between this element (start and
   *          end) is considered for creating a new document. Must not
   *          be null.
   * @param encoding the encoding of the trec file; if null or blank the
   *          reader is created without an explicit encoding.
   * @param numberOfDocumentsToExtract extracts the specified number of
   *          documents from the trecweb file; -1 to indicate all files.
   * @param documentNamePrefix prefix for the generated document names;
   *          when non-null it is trimmed and followed by "_".
   * @param mimeType the mime type which determines how the document is
   *          handled; if null no mime type is set on the documents.
   * @param includeRootElement if false, the opening and closing root
   *          element tags are stripped from the document content.
   * @return total length of populated documents in the corpus in number
   *         of bytes
   * @throws java.io.IOException if the file cannot be opened or read
   */
  public static long populate(Corpus corpus, URL singleConcatenatedFile,
      String documentRootElement, String encoding,
      int numberOfDocumentsToExtract, String documentNamePrefix,
      String mimeType, boolean includeRootElement) throws IOException {

    StatusListener sListener = (StatusListener)gate.Gate.getListeners().get("gate.event.StatusListener");

    // obtain the root element that user has provided
    // content between the start and end of root element is considered
    // for creating documents
    documentRootElement = documentRootElement.toLowerCase();

    // the markers searched for in every (lower-cased) line
    final String startTagWithAttributes = "<" + documentRootElement + " ";
    final String startTagWithoutAttributes = "<" + documentRootElement + ">";
    final String endTag = "</" + documentRootElement + ">";

    // document name prefix could be an empty string
    documentNamePrefix = documentNamePrefix == null ? "" : documentNamePrefix
            .trim()
            + "_";

    // we start a new document when we find <documentRootElement> and
    // close it when we find </documentRootElement>
    BufferedReader br = null;
    try {

      // NOTE(review): 10485760 is presumably the reader's buffer size
      // (10 MB) - confirm against BomStrippingInputStreamReader
      if(encoding != null && encoding.trim().length() != 0) {
        br = new BomStrippingInputStreamReader(singleConcatenatedFile.openStream(),
                encoding, 10485760);
      }
      else {
        br = new BomStrippingInputStreamReader(singleConcatenatedFile.openStream(),
                10485760);
      }

      // reading line by line
      String line = br.readLine();

      // this is where we store document content
      StringBuilder documentString = new StringBuilder();

      // toggle switch to indicate search for start element
      boolean searchingForStartElement = true;

      // keeping count of number of documents extracted
      int count = 1;

      // length in bytes read so far (to return)
      long lengthInBytes = 0;

      // continue until reached the end of file
      while(line != null) {

        // lowercase the line in order to match documentRootElement in any case
        String lowerCasedLine = line.toLowerCase();

        // if searching for startElement?
        if(searchingForStartElement) {

          // may be its with attributes
          int index = lowerCasedLine.indexOf(startTagWithAttributes);

          // may be no attributes?
          if(index == -1) {
            index = lowerCasedLine.indexOf(startTagWithoutAttributes);
          }

          // if index <0, we are out of the content boundaries, so simply
          // skip the current line and start reading from the next line
          if(index != -1) {
            // if found, that's the first line; the same line is examined
            // again on the next iteration, this time for the end tag
            line = line.substring(index);
            searchingForStartElement = false;
          }
          else {
            line = br.readLine();
          }
        }
        else {

          // now searching for last element, i.e. the closing tag.
          // (Searching for an empty string would match at position 0 of
          // every line and truncate each document on its first line.)
          int index = lowerCasedLine.indexOf(endTag);

          // if not found.. this is the content of a new document
          if(index == -1) {
            documentString.append(line + "\n");
            line = br.readLine();
          }
          else {

            // found.. then end the document (closing tag included)
            documentString.append(line.substring(0, index + endTag.length()));

            // getting ready for the next document
            searchingForStartElement = true;

            // here lets create a new document create the doc
            if(sListener != null) sListener.statusChanged("Creating Document Number :" + count);

            String docName = documentNamePrefix + count + "_" + Gate.genSym();

            String docContent = documentString.toString();

            // drop everything up to the end of the opening tag and from
            // the start of the closing tag
            if (!includeRootElement)
              docContent = docContent.substring(docContent.indexOf(">")+1, docContent.lastIndexOf("<"));

            FeatureMap params = Factory.newFeatureMap();
            if (mimeType != null) params.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, mimeType);
            params.put(Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME, docContent);
            if(encoding != null && encoding.trim().length() > 0)
              params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, encoding);

            // calculate the length
            lengthInBytes += docContent.getBytes().length;

            try {
              Document doc = (Document)Factory.createResource(DocumentImpl.class.getName(), params, null, docName);
              count++;
              corpus.add(doc);

              if(corpus.getLRPersistenceId() != null) {
                // persistent corpus -> unload the document
                corpus.unloadDocument(doc);
                Factory.deleteResource(doc);
              }

              // already extracted requested num of documents?
              if((count - 1) == numberOfDocumentsToExtract) break;
            }
            catch(Throwable t) {
              // deliberate best effort: report and go on with the next
              // document
              String nl = Strings.getNl();
              Err.prln("WARNING: Corpus.populate could not instantiate document" + nl
                  + "  Document name was: " + docName + nl
                  + "  Exception was: " + t + nl + nl);
              t.printStackTrace();
            }

            documentString = new StringBuilder();
            if(sListener != null) sListener.statusChanged(docName + " created!");

            // keep whatever follows the closing tag on this line: it may
            // hold the start of the next document
            line = line.substring(index + endTag.length());
            if (line.trim().equals("")) line = br.readLine();
          }
        }
      }
      return lengthInBytes;
    }
    finally {
      if(br != null) br.close();
    }
  }// public static void populate

  /**
   * Populates this corpus with documents extracted from a single
   * concatenated file. The work is done by the static
   * {@code CorpusImpl.populate} method that takes the target corpus as
   * its first argument; every argument is passed through unchanged.
   * 
   * @param singleConcatenatedFile the single concatenated file to load.
   * @param documentRootElement content between the start and end of
   *          this element is considered for documents.
   * @param encoding the encoding of the trec file.
   * @param numberOfFilesToExtract indicates the number of files to
   *          extract from the trecweb file.
   * @param documentNamePrefix the prefix to use for the names of the
   *          created documents
   * @param mimeType the mime type which determines how the document is handled
   * @param includeRootElement passed through to the static method
   * @return total length of populated documents in the corpus in number
   *         of bytes
   */
  @Override
  public long populate(URL singleConcatenatedFile, String documentRootElement,
      String encoding, int numberOfFilesToExtract,
      String documentNamePrefix, String mimeType, boolean includeRootElement) throws IOException,
      ResourceInstantiationException {
    final long populatedBytes = CorpusImpl.populate(this,
        singleConcatenatedFile, documentRootElement, encoding,
        numberOfFilesToExtract, documentNamePrefix, mimeType,
        includeRootElement);
    return populatedBytes;
  }

  /**
   * Unregisters a corpus listener. The listener collection is never
   * modified in place: a modified copy replaces it, so a vector already
   * picked up by an event-firing method is left untouched.
   * 
   * @param l the listener to remove; ignored if it is not registered
   */
  @Override
  public synchronized void removeCorpusListener(CorpusListener l) {
    if(corpusListeners != null && corpusListeners.contains(l)) {
      @SuppressWarnings("unchecked")
      Vector<CorpusListener> v = (Vector<CorpusListener>)corpusListeners.clone();
      v.removeElement(l);
      corpusListeners = v;
    }
  }

  /**
   * Registers a corpus listener. The listener collection is never
   * modified in place: a modified copy replaces it, so a vector already
   * picked up by an event-firing method is left untouched.
   * 
   * @param l the listener to add; ignored if it is already registered
   */
  @Override
  public synchronized void addCorpusListener(CorpusListener l) {
    @SuppressWarnings("unchecked")
    Vector<CorpusListener> v = corpusListeners == null
            ? new Vector<CorpusListener>(2)
            : (Vector<CorpusListener>)corpusListeners.clone();
    if(!v.contains(l)) {
      v.addElement(l);
      corpusListeners = v;
    }
  }

  /**
   * Custom duplication for a corpus: the corpus itself is copied with the
   * default duplication mechanism, then each member document is
   * duplicated in turn and appended to the copy.
   * 
   * @param ctx the duplication context handed on to the factory
   * @return the duplicated corpus
   */
  @Override
  public Resource duplicate(Factory.DuplicationContext ctx)
          throws ResourceInstantiationException {
    Corpus copy = (Corpus)Factory.defaultDuplicate(this, ctx);
    for(Document member : this) {
      Document memberCopy = (Document)Factory.duplicate(member, ctx);
      copy.add(memberCopy);
    }
    return copy;
  }

  /** Freeze the serialization UID. */
  static final long serialVersionUID = -1113142759053898456L;

  /**
   * The registered corpus listeners; null until the first listener is
   * added. Not serialized.
   */
  private transient Vector<CorpusListener> corpusListeners;

  /**
   * The documents supplied through the documentsList parameter; they are
   * added to the corpus by {@link #init()}. Not serialized.
   */
  protected transient List<Document> documentsList;

  /**
   * Notifies every registered corpus listener that a document was added.
   * 
   * @param e the event to deliver
   */
  protected void fireDocumentAdded(CorpusEvent e) {
    // read the field once; the vector it refers to is replaced, never
    // modified, by the add/remove listener methods
    Vector<CorpusListener> listeners = corpusListeners;
    if(listeners == null) return;
    int count = listeners.size();
    for(int i = 0; i < count; i++) {
      listeners.elementAt(i).documentAdded(e);
    }
  }

  /**
   * Notifies every registered corpus listener that a document was
   * removed.
   * 
   * @param e the event to deliver
   */
  protected void fireDocumentRemoved(CorpusEvent e) {
    // read the field once; the vector it refers to is replaced, never
    // modified, by the add/remove listener methods
    Vector<CorpusListener> listeners = corpusListeners;
    if(listeners == null) return;
    int count = listeners.size();
    for(int i = 0; i < count; i++) {
      listeners.elementAt(i).documentRemoved(e);
    }
  }

  /**
   * Sets the list of documents that {@link #init()} adds to this corpus.
   * 
   * @param documentsList the initial documents; may be null
   */
  @Optional
  @CreoleParameter(collectionElementType = Document.class, comment = "A list of GATE documents")
  public void setDocumentsList(java.util.List<Document> documentsList) {
    this.documentsList = documentsList;
  }

  /**
   * Gets the list of documents supplied through
   * {@link #setDocumentsList}.
   * 
   * @return the list as it was set (not a copy); may be null
   */
  public java.util.List<Document> getDocumentsList() {
    return documentsList;
  }

  /** CREOLE callback; this implementation does nothing. */
  @Override
  public void resourceLoaded(CreoleEvent e) {
    // intentionally empty
  }

  /**
   * CREOLE callback: when the unloaded resource is a document, every
   * occurrence of it is removed from this corpus.
   */
  @Override
  public void resourceUnloaded(CreoleEvent e) {
    Resource unloaded = e.getResource();
    if(!(unloaded instanceof Document)) {
      return;
    }
    // the same document may be present more than once
    while(contains(unloaded)) {
      remove(unloaded);
    }
  }

  /** CREOLE callback; this implementation does nothing. */
  @Override
  public void resourceRenamed(Resource resource, String oldName, String newName) {
    // intentionally empty
  }

  /** CREOLE callback; this implementation does nothing. */
  @Override
  public void datastoreOpened(CreoleEvent e) {
    // intentionally empty
  }

  /** CREOLE callback; this implementation does nothing. */
  @Override
  public void datastoreCreated(CreoleEvent e) {
    // intentionally empty
  }

  /** CREOLE callback; this implementation does nothing. */
  @Override
  public void datastoreClosed(CreoleEvent e) {
    // intentionally empty
  }
} // class CorpusImpl