gate.corpora.CorpusImpl Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of gate-core Show documentation
GATE - general achitecture for text engineering - is open source software capable of solving almost any text processing problem. This artifact enables you to embed the core GATE Embedded with its essential dependencies. You will able to use the GATE Embedded API and load and store GATE XML documents. This artifact is the perfect dependency for CREOLE plugins or for applications that need to customize the GATE dependencies due to confict with their own dependencies or for lower footprint.
The newest version!
/*
 *  CorpusImpl.java
 *
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Hamish Cunningham, 11/Feb/2000
 *
 *  $Id: CorpusImpl.java 17604 2014-03-09 10:08:13Z markagreenwood $
 */

package gate.corpora;

import gate.Corpus;
import gate.Document;
import gate.Factory;
import gate.FeatureMap;
import gate.Gate;
import gate.Resource;
import gate.creole.AbstractLanguageResource;
import gate.creole.CustomDuplication;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.event.CorpusEvent;
import gate.event.CorpusListener;
import gate.event.CreoleEvent;
import gate.event.CreoleListener;
import gate.event.StatusListener;
import gate.util.BomStrippingInputStreamReader;
import gate.util.Err;
import gate.util.Files;
import gate.util.Strings;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Serializable;
import java.net.URL;
import java.util.AbstractList;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.ListIterator;
import java.util.Vector;

/**
 * Corpora are sets of Document. They are ordered by lexicographic
 * collation on Url.
 */
@CreoleResource(name = "GATE Corpus", comment = "GATE transient corpus.", interfaceName = "gate.Corpus", icon = "corpus-trans", helpURL = "http://gate.ac.uk/userguide/sec:developer:loadlr")
public class CorpusImpl extends AbstractLanguageResource implements Corpus,
                                                        CreoleListener,
                                                        CustomDuplication {

  public CorpusImpl() {
    supportList = Collections.synchronizedList(new VerboseList());
    Gate.getCreoleRegister().addCreoleListener(this);
  }

  /**
   * Gets the names of the documents in this corpus.
   * 
   * @return a {@link List} of Strings representing the names of the
   *         documents in this corpus.
   */
  @Override
  public List getDocumentNames() {
    ArrayList res = new ArrayList(supportList.size());
    for(Object document : supportList) {
      res.add(((Document)document).getName());
    }
    return res;
  }

  /**
   * Gets the name of a document in this corpus.
   * 
   * @param index the index of the document
   * @return a String value representing the name of the document at
   *         index in this corpus.
   */
  @Override
  public String getDocumentName(int index) {
    return supportList.get(index).getName();
  }

  /**
   * This method does not make sense for transient corpora, so it does
   * nothing.
   */
  @Override
  public void unloadDocument(Document doc) {
    return;
  }

  /**
   * The underlying list that holds the documents in this corpus.
   */
  protected List supportList = null;

  /**
   * A proxy list that stores the actual data in an internal list and
   * forwards all operations to that one but it also fires the
   * appropriate corpus events when necessary. It also does some type
   * checking so only Documents are accepted as corpus members.
   */
  protected class VerboseList extends AbstractList implements Serializable {

    private static final long serialVersionUID = 3483062654980468826L;

    VerboseList() {
      data = new ArrayList();
    }

    @Override
    public Document get(int index) {
      return data.get(index);
    }

    @Override
    public int size() {
      return data.size();
    }

    @Override
    public Document set(int index, Document element) {
        Document oldDoc = data.set(index, element);

        // fire the 2 events
        fireDocumentRemoved(new CorpusEvent(CorpusImpl.this, oldDoc, index,
                CorpusEvent.DOCUMENT_REMOVED));
        fireDocumentAdded(new CorpusEvent(CorpusImpl.this, element, index,
                CorpusEvent.DOCUMENT_ADDED));
        return oldDoc;
    }

    @Override
    public void add(int index, Document element) {
        data.add(index, element);

        // fire the event
        fireDocumentAdded(new CorpusEvent(CorpusImpl.this, element,
                index, CorpusEvent.DOCUMENT_ADDED));
    }

    @Override
    public Document remove(int index) {
      Document oldDoc = data.remove(index);

      fireDocumentRemoved(new CorpusEvent(CorpusImpl.this, oldDoc, index,
              CorpusEvent.DOCUMENT_REMOVED));
      return oldDoc;
    }

    /**
     * The List containing the actual data.
     */
    List data;
  }

  /**
   * This method returns true when the document is already loaded in
   * memory
   */
  @Override
  public boolean isDocumentLoaded(int index) {
    return true;
  }

  protected void clearDocList() {
    if(supportList == null) return;
    supportList.clear();
  }

  // List methods
  // java docs will be automatically copied from the List interface.

  @Override
  public int size() {
    return supportList.size();
  }

  @Override
  public boolean isEmpty() {
    return supportList.isEmpty();
  }

  @Override
  public boolean contains(Object o) {
    return supportList.contains(o);
  }

  @Override
  public Iterator iterator() {
    return supportList.iterator();
  }

  @Override
  public Object[] toArray() {
    return supportList.toArray();
  }

  @Override
  public  T[] toArray(T[] a) {
    return supportList.toArray(a);
  }

  @Override
  public boolean add(Document o) {
    return supportList.add(o);
  }

  @Override
  public boolean remove(Object o) {
    return supportList.remove(o);
  }

  @Override
  public boolean containsAll(Collection c) {
    return supportList.containsAll(c);
  }

  @Override
  public boolean addAll(Collection c) {
    return supportList.addAll(c);
  }

  @Override
  public boolean addAll(int index, Collection c) {
    return supportList.addAll(index, c);
  }

  @Override
  public boolean removeAll(Collection c) {
    return supportList.removeAll(c);
  }

  @Override
  public boolean retainAll(Collection c) {
    return supportList.retainAll(c);
  }

  @Override
  public void clear() {
    supportList.clear();
  }

  @Override
  public boolean equals(Object o) {
    if(!(o instanceof CorpusImpl)) return false;

    return supportList.equals(o);
  }

  @Override
  public int hashCode() {
    return supportList.hashCode();
  }

  @Override
  public Document get(int index) {
    return supportList.get(index);
  }

  @Override
  public Document set(int index, Document element) {
    return supportList.set(index, element);
  }

  @Override
  public void add(int index, Document element) {
    supportList.add(index, element);
  }

  @Override
  public Document remove(int index) {
    return supportList.remove(index);
  }

  @Override
  public int indexOf(Object o) {
    return supportList.indexOf(o);
  }

  @Override
  public int lastIndexOf(Object o) {
    return supportList.lastIndexOf(o);
  }

  @Override
  public ListIterator listIterator() {
    return supportList.listIterator();
  }

  @Override
  public ListIterator listIterator(int index) {
    return supportList.listIterator(index);
  }

  @Override
  public List subList(int fromIndex, int toIndex) {
    return supportList.subList(fromIndex, toIndex);
  }

  /** Construction */

  @Override
  public void cleanup() {
    Gate.getCreoleRegister().removeCreoleListener(this);
  }

  /** Initialise this resource, and return it. */
  @Override
  public Resource init() {
    if(documentsList != null && !documentsList.isEmpty()) {
      addAll(documentsList);
    }
    return this;
  } // init()

  /**
   * Fills the provided corpus with documents created on the fly from
   * selected files in a directory. Uses a {@link FileFilter} to select
   * which files will be used and which will be ignored. A simple file
   * filter based on extensions is provided in the Gate distribution (
   * {@link gate.util.ExtensionFileFilter}).
   * 
   * @param corpus the corpus to be populated
   * @param directory the directory from which the files will be picked.
   *          This parameter is an URL for uniformity. It needs to be a
   *          URL of type file otherwise an InvalidArgumentException
   *          will be thrown.
   * @param filter the file filter used to select files from the target
   *          directory. If the filter is null all the files
   *          will be accepted.
   * @param encoding the encoding to be used for reading the documents
   * @param recurseDirectories should the directory be parsed
   *          recursively?. If true all the files from the
   *          provided directory and all its children directories (on as
   *          many levels as necessary) will be picked if accepted by
   *          the filter otherwise the children directories will be
   *          ignored.
   * @throws java.io.IOException if a file doesn't exist
   */
  public static void populate(Corpus corpus, URL directory, FileFilter filter,
          String encoding, boolean recurseDirectories) throws IOException {
    populate(corpus, directory, filter, encoding, null, recurseDirectories);
  }

  /**
   * Fills the provided corpus with documents created on the fly from
   * selected files in a directory. Uses a {@link FileFilter} to select
   * which files will be used and which will be ignored. A simple file
   * filter based on extensions is provided in the Gate distribution (
   * {@link gate.util.ExtensionFileFilter}).
   * 
   * @param corpus the corpus to be populated
   * @param directory the directory from which the files will be picked.
   *          This parameter is an URL for uniformity. It needs to be a
   *          URL of type file otherwise an InvalidArgumentException
   *          will be thrown.
   * @param filter the file filter used to select files from the target
   *          directory. If the filter is null all the files
   *          will be accepted.
   * @param encoding the encoding to be used for reading the documents
   * @param recurseDirectories should the directory be parsed
   *          recursively?. If true all the files from the
   *          provided directory and all its children directories (on as
   *          many levels as necessary) will be picked if accepted by
   *          the filter otherwise the children directories will be
   *          ignored.
   * @throws java.io.IOException if a file doesn't exist
   */
  public static void populate(Corpus corpus, URL directory, FileFilter filter,
          String encoding, String mimeType, boolean recurseDirectories)
          throws IOException {

    // check input
    if(!directory.getProtocol().equalsIgnoreCase("file"))
      throw new IllegalArgumentException(
              "The URL provided is not of type \"file:\"!");

    File dir = Files.fileFromURL(directory);
    if(!dir.exists()) throw new FileNotFoundException(dir.toString());

    if(!dir.isDirectory())
      throw new IllegalArgumentException(dir.getAbsolutePath()
              + " is not a directory!");

    File[] files;
    // populate the corpus
    if(recurseDirectories) {
      files = Files.listFilesRecursively(dir, filter);
    }
    else {
      files = dir.listFiles(filter);
    }

    if(files == null) {
      return;
    }

    // sort the files alphabetically regardless of their paths
    Arrays.sort(files, new Comparator() {
      @Override
      public int compare(File f1, File f2) {
        return f1.getName().compareTo(f2.getName());
      }
    });

    // create the GATE documents
    for(File file : files) {
      if(file.isDirectory()) {
        continue;
      }
      StatusListener sListener = (StatusListener)Gate.getListeners().get(
              "gate.event.StatusListener");
      if(sListener != null)
        sListener.statusChanged("Reading: " + file.getName());
      String docName = file.getName() + "_" + Gate.genSym();
      FeatureMap params = Factory.newFeatureMap();
      params.put(Document.DOCUMENT_URL_PARAMETER_NAME, file.toURI().toURL());
      if(encoding != null)
        params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, encoding);
      if(mimeType != null)
        params.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, mimeType);

      try {
        Document doc = (Document)Factory.createResource(DocumentImpl.class
                .getName(), params, null, docName);
        corpus.add(doc);
        if(corpus.getLRPersistenceId() != null) {
          // persistent corpus -> unload the document
          corpus.unloadDocument(doc);
          Factory.deleteResource(doc);
        }
      }
      catch(Throwable t) {
        String nl = Strings.getNl();
        Err.prln("WARNING: Corpus.populate could not instantiate document" + nl
                + "  Document name was: " + docName + nl + "  Exception was: "
                + t + nl + nl);
        t.printStackTrace();
      }
      if(sListener != null) sListener.statusChanged(file.getName() + " read");
    }

  }// public static void populate

  /**
   * Fills this corpus with documents created from files in a directory.
   * 
   * @param filter the file filter used to select files from the target
   *          directory. If the filter is null all the files
   *          will be accepted.
   * @param directory the directory from which the files will be picked.
   *          This parameter is an URL for uniformity. It needs to be a
   *          URL of type file otherwise an InvalidArgumentException
   *          will be thrown. An implementation for this method is
   *          provided as a static method at
   *          {@link gate.corpora.CorpusImpl#populate(Corpus, URL, FileFilter, String, boolean)}
   *          .
   * @param encoding the encoding to be used for reading the documents
   * @param recurseDirectories should the directory be parsed
   *          recursively?. If true all the files from the
   *          provided directory and all its children directories (on as
   *          many levels as necessary) will be picked if accepted by
   *          the filter otherwise the children directories will be
   *          ignored.
   */
  @Override
  public void populate(URL directory, FileFilter filter, String encoding,
          boolean recurseDirectories) throws IOException,
          ResourceInstantiationException {
    populate(this, directory, filter, encoding, null, recurseDirectories);
  }

  /**
   * Fills this corpus with documents created from files in a directory.
   * 
   * @param filter the file filter used to select files from the target
   *          directory. If the filter is null all the files
   *          will be accepted.
   * @param directory the directory from which the files will be picked.
   *          This parameter is an URL for uniformity. It needs to be a
   *          URL of type file otherwise an InvalidArgumentException
   *          will be thrown. An implementation for this method is
   *          provided as a static method at
   *          {@link gate.corpora.CorpusImpl#populate(Corpus, URL, FileFilter, String, boolean)}
   *          .
   * @param encoding the encoding to be used for reading the documents
   *@param mimeType the mime type to be used when loading documents. If
   *          null, then the mime type will be detected automatically.
   * 
   * @param recurseDirectories should the directory be parsed
   *          recursively?. If true all the files from the
   *          provided directory and all its children directories (on as
   *          many levels as necessary) will be picked if accepted by
   *          the filter otherwise the children directories will be
   *          ignored.
   */
  @Override
  public void populate(URL directory, FileFilter filter, String encoding,
          String mimeType, boolean recurseDirectories) throws IOException,
          ResourceInstantiationException {
    populate(this, directory, filter, encoding, mimeType, recurseDirectories);
  }

  /**
   * Fills the provided corpus with documents extracted from the
   * provided trec file.
   * 
   * @param corpus the corpus to be populated.
   * @param singleConcatenatedFile the trec file.
   * @param documentRootElement text between this element (start and
   *          end) is considered for creating a new document.
   * @param encoding the encoding of the trec file.
   * @param numberOfDocumentsToExtract extracts the specified number of
   *          documents from the trecweb file; -1 to indicate all files.
   * @param mimeType the mime type which determines how the document is handled
   * @return total length of populated documents in the corpus in number
   *         of bytes
   * @throws java.io.IOException
   */  
  public static long populate(Corpus corpus, URL singleConcatenatedFile,
      String documentRootElement, String encoding,
      int numberOfDocumentsToExtract, String documentNamePrefix,
      String mimeType, boolean includeRootElement) throws IOException { 
    
    StatusListener sListener = (StatusListener)gate.Gate.getListeners().get("gate.event.StatusListener");
    
    // obtain the root element that user has provided
    // content between the start and end of root element is considered
    // for creating documents
    documentRootElement = documentRootElement.toLowerCase();

    // document name prefix could be an empty string
    documentNamePrefix = documentNamePrefix == null ? "" : documentNamePrefix
            .trim()
            + "_";

    // we start a new document when we find  and
    // close it when we find 
    BufferedReader br = null;
    try {
      
      if(encoding != null && encoding.trim().length() != 0) {
        br = new BomStrippingInputStreamReader(singleConcatenatedFile.openStream(),
                encoding, 10485760);
      }
      else {
        br = new BomStrippingInputStreamReader(singleConcatenatedFile.openStream(),
                10485760);
      }

      // reading line by line
      String line = br.readLine();

      // this is where we store document content
      StringBuilder documentString = new StringBuilder();

      // toggle switch to indicate search for start element
      boolean searchingForStartElement = true;

      // keeping count of number of documents extracted
      int count = 1;

      // length in bytes read so far (to return)
      long lengthInBytes = 0;

      // continue until reached the end of file
      while(line != null) {

        // lowercase the line in order to match documentRootElement in any case
        String lowerCasedLine = line.toLowerCase();

        // if searching for startElement?
        if(searchingForStartElement) {

          // may be its with attributes
          int index = lowerCasedLine.indexOf("<" + documentRootElement + " ");

          // may be no attributes?
          if(index == -1) {
            index = lowerCasedLine.indexOf("<" + documentRootElement + ">");
          }

          // if index <0, we are out of the content boundaries, so simply
          // skip the current line and start reading from the next line
          if(index != -1) {
            // if found, that's the first line
            line = line.substring(index);
            searchingForStartElement = false;
          }
          else {
            line = br.readLine();
          }
        }
        else {

          // now searching for last element
          int index = lowerCasedLine.indexOf("");

          // if not found.. this is the content of a new document
          if(index == -1) {
            documentString.append(line + "\n");
            line = br.readLine();
          }
          else {

            // found.. then end the document
            documentString.append(line.substring(0, index + documentRootElement.length() + 3));

            // getting ready for the next document
            searchingForStartElement = true;

            // here lets create a new document create the doc
            if(sListener != null) sListener.statusChanged("Creating Document Number :" + count);
            
            String docName = documentNamePrefix + count + "_" + Gate.genSym();
            
            String docContent = documentString.toString();
            
            if (!includeRootElement)
              docContent = docContent.substring(docContent.indexOf(">")+1, docContent.lastIndexOf("<"));
            
            FeatureMap params = Factory.newFeatureMap();
            if (mimeType != null) params.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, mimeType);            
            params.put(Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME, docContent);
            if(encoding != null && encoding.trim().length() > 0)
              params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, encoding); 
            
            // calculate the length
            lengthInBytes += docContent.getBytes().length;            

            try {
              Document doc = (Document)Factory.createResource(DocumentImpl.class.getName(), params, null, docName);
              count++;
              corpus.add(doc);
              
              if(corpus.getLRPersistenceId() != null) {
                // persistent corpus -> unload the document
                corpus.unloadDocument(doc);
                Factory.deleteResource(doc);
              }
              
              // already extracted requested num of documents?
              if((count - 1) == numberOfDocumentsToExtract) break;
            }
            catch(Throwable t) {
              String nl = Strings.getNl();
              Err.prln("WARNING: Corpus.populate could not instantiate document" + nl
                  + "  Document name was: " + docName + nl
                  + "  Exception was: " + t + nl + nl);
              t.printStackTrace();
            }
            
            documentString = new StringBuilder();
            if(sListener != null) sListener.statusChanged(docName + " created!");
           
            line = line.substring(index + documentRootElement.length() + 3);
            if (line.trim().equals("")) line = br.readLine();
          }
        }
      }
      return lengthInBytes;
    }
    finally {
      if(br != null) br.close();
    }
  }// public static void populate

  /**
   * Fills the provided corpus with documents extracted from the
   * provided single concatenated file.
   * 
   * @param singleConcatenatedFile the single concatenated file to load.
   * @param documentRootElement content between the start and end of
   *          this element is considered for documents.
   * @param encoding the encoding of the trec file.
   * @param numberOfFilesToExtract indicates the number of files to
   *          extract from the trecweb file.
   * @param documentNamePrefix the prefix to use for document names when
   *          creating from
   * @param mimeType the mime type which determines how the document is handled
   * @return total length of populated documents in the corpus in number
   *         of bytes
   */ 
  @Override
  public long populate(URL singleConcatenatedFile, String documentRootElement,
      String encoding, int numberOfFilesToExtract,
      String documentNamePrefix, String mimeType, boolean includeRootElement) throws IOException,
      ResourceInstantiationException {
    return CorpusImpl.populate(this, singleConcatenatedFile,
        documentRootElement, encoding, numberOfFilesToExtract,
        documentNamePrefix, mimeType, includeRootElement);
}

  @Override
  public synchronized void removeCorpusListener(CorpusListener l) {
    if(corpusListeners != null && corpusListeners.contains(l)) {
      @SuppressWarnings("unchecked")
      Vector v = (Vector)corpusListeners.clone();
      v.removeElement(l);
      corpusListeners = v;
    }
  }

  @Override
  public synchronized void addCorpusListener(CorpusListener l) {
    @SuppressWarnings("unchecked")
    Vector v = corpusListeners == null
            ? new Vector(2)
            : (Vector)corpusListeners.clone();
    if(!v.contains(l)) {
      v.addElement(l);
      corpusListeners = v;
    }
  }

  /**
   * Custom duplication for a corpus - duplicate this corpus in the
   * usual way, then duplicate the documents in this corpus and add them
   * to the duplicate.
   */
  @Override
  public Resource duplicate(Factory.DuplicationContext ctx)
          throws ResourceInstantiationException {
    Corpus newCorpus = (Corpus)Factory.defaultDuplicate(this, ctx);
    for(Document d : this) {
      newCorpus.add((Document)Factory.duplicate(d, ctx));
    }
    return newCorpus;
  }

  /** Freeze the serialization UID. */
  static final long serialVersionUID = -1113142759053898456L;

  private transient Vector corpusListeners;

  protected transient List documentsList;

  protected void fireDocumentAdded(CorpusEvent e) {
    if(corpusListeners != null) {
      Vector listeners = corpusListeners;
      int count = listeners.size();
      for(int i = 0; i < count; i++) {
        listeners.elementAt(i).documentAdded(e);
      }
    }
  }

  protected void fireDocumentRemoved(CorpusEvent e) {
    if(corpusListeners != null) {
      Vector listeners = corpusListeners;
      int count = listeners.size();
      for(int i = 0; i < count; i++) {
        listeners.elementAt(i).documentRemoved(e);
      }
    }
  }

  @Optional
  @CreoleParameter(collectionElementType = Document.class, comment = "A list of GATE documents")
  public void setDocumentsList(java.util.List documentsList) {
    this.documentsList = documentsList;
  }

  public java.util.List getDocumentsList() {
    return documentsList;
  }

  @Override
  public void resourceLoaded(CreoleEvent e) {
  }

  @Override
  public void resourceUnloaded(CreoleEvent e) {
    Resource res = e.getResource();
    // remove all occurences
    if(res instanceof Document) while(contains(res))
      remove(res);
  }

  @Override
  public void resourceRenamed(Resource resource, String oldName, String newName) {
  }

  @Override
  public void datastoreOpened(CreoleEvent e) {
  }

  @Override
  public void datastoreCreated(CreoleEvent e) {
  }

  @Override
  public void datastoreClosed(CreoleEvent e) {
  }
} // class CorpusImpl