All Downloads are FREE. Search and download functionalities are using the official Maven repository.

gate.corpora.CorpusImpl Maven / Gradle / Ivy

/*
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Hamish Cunningham, 11/Feb/2000
 *
 *  $Id: CorpusImpl.java 17604 2014-03-09 10:08:13Z markagreenwood $
 */

package gate.corpora;

import gate.Corpus;
import gate.Document;
import gate.Factory;
import gate.FeatureMap;
import gate.Gate;
import gate.Resource;
import gate.creole.AbstractLanguageResource;
import gate.creole.CustomDuplication;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.event.CorpusEvent;
import gate.event.CorpusListener;
import gate.event.CreoleEvent;
import gate.event.CreoleListener;
import gate.event.StatusListener;
import gate.util.BomStrippingInputStreamReader;
import gate.util.Err;
import gate.util.Files;
import gate.util.Strings;

import java.util.AbstractList;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.ListIterator;
import java.util.Vector;

/**
 * Corpora are sets of {@link Document}s. They are ordered by lexicographic
 * collation on URL.
 */
@CreoleResource(name = "GATE Corpus", comment = "GATE transient corpus.", interfaceName = "gate.Corpus", icon = "corpus-trans", helpURL = "")
public class CorpusImpl extends AbstractLanguageResource implements Corpus,
                                                        CustomDuplication {

  /**
   * Creates an empty transient corpus backed by a {@link VerboseList}
   * wrapped with {@link Collections#synchronizedList(List)}.
   */
  public CorpusImpl() {
    supportList = Collections.synchronizedList(new VerboseList());
    // NOTE(review): this copy of the file lost lines during extraction; if
    // the class is meant to receive CreoleListener callbacks (see
    // resourceUnloaded), a listener registration may belong here - confirm
    // against the upstream source.
  }

  /**
   * Gets the names of the documents in this corpus.
   *
   * @return a {@link List} of Strings representing the names of the
   *         documents in this corpus.
   */
  public List getDocumentNames() {
    ArrayList res = new ArrayList(supportList.size());
    for(Object document : supportList) {
    return res;

  /**
   * Gets the name of a document in this corpus.
   *
   * @param index the index of the document
   * @return a String value representing the name of the document at
   *         <code>index</code> in this corpus.
   */
  public String getDocumentName(int index) {
    return supportList.get(index).getName();

  /**
   * This method does not make sense for transient corpora, so it does
   * nothing.
   */
  public void unloadDocument(Document doc) {

  /**
   * The underlying list that holds the documents in this corpus.
   */
  // Typed as List<Document> so that get()/set()/remove() on this class can
  // return Document without casts; assigned in the constructor.
  protected List<Document> supportList = null;

  /**
   * A proxy list that stores the actual data in an internal list and
   * forwards all operations to that one, but it also fires the
   * appropriate corpus events when necessary. It also does some type
   * checking so only Documents are accepted as corpus members.
   */
  protected class VerboseList extends AbstractList implements Serializable {

    private static final long serialVersionUID = 3483062654980468826L;

    VerboseList() {
      data = new ArrayList();

    public Document get(int index) {
      return data.get(index);

    public int size() {
      return data.size();

    public Document set(int index, Document element) {
        Document oldDoc = data.set(index, element);

        // fire the 2 events
        fireDocumentRemoved(new CorpusEvent(CorpusImpl.this, oldDoc, index,
        fireDocumentAdded(new CorpusEvent(CorpusImpl.this, element, index,
        return oldDoc;

    public void add(int index, Document element) {
        data.add(index, element);

        // fire the event
        fireDocumentAdded(new CorpusEvent(CorpusImpl.this, element,
                index, CorpusEvent.DOCUMENT_ADDED));

    public Document remove(int index) {
      Document oldDoc = data.remove(index);

      fireDocumentRemoved(new CorpusEvent(CorpusImpl.this, oldDoc, index,
      return oldDoc;

     * The List containing the actual data.
    List data;

  /**
   * This method returns true when the document is already loaded in
   * memory.
   */
  public boolean isDocumentLoaded(int index) {
    return true;

  protected void clearDocList() {
    if(supportList == null) return;

  // List methods
  // java docs will be automatically copied from the List interface.

  public int size() {
    return supportList.size();

  public boolean isEmpty() {
    return supportList.isEmpty();

  public boolean contains(Object o) {
    return supportList.contains(o);

  public Iterator iterator() {
    return supportList.iterator();

  public Object[] toArray() {
    return supportList.toArray();

  public  T[] toArray(T[] a) {
    return supportList.toArray(a);

  public boolean add(Document o) {
    return supportList.add(o);

  public boolean remove(Object o) {
    return supportList.remove(o);

  public boolean containsAll(Collection c) {
    return supportList.containsAll(c);

  public boolean addAll(Collection c) {
    return supportList.addAll(c);

  public boolean addAll(int index, Collection c) {
    return supportList.addAll(index, c);

  public boolean removeAll(Collection c) {
    return supportList.removeAll(c);

  public boolean retainAll(Collection c) {
    return supportList.retainAll(c);

  public void clear() {

  public boolean equals(Object o) {
    if(!(o instanceof CorpusImpl)) return false;

    return supportList.equals(o);

  public int hashCode() {
    return supportList.hashCode();

  public Document get(int index) {
    return supportList.get(index);

  public Document set(int index, Document element) {
    return supportList.set(index, element);

  public void add(int index, Document element) {
    supportList.add(index, element);

  public Document remove(int index) {
    return supportList.remove(index);

  public int indexOf(Object o) {
    return supportList.indexOf(o);

  public int lastIndexOf(Object o) {
    return supportList.lastIndexOf(o);

  public ListIterator listIterator() {
    return supportList.listIterator();

  public ListIterator listIterator(int index) {
    return supportList.listIterator(index);

  public List subList(int fromIndex, int toIndex) {
    return supportList.subList(fromIndex, toIndex);

  /** Construction */

  public void cleanup() {

  /** Initialise this resource, and return it. */
  public Resource init() {
    if(documentsList != null && !documentsList.isEmpty()) {
    return this;
  } // init()

  /**
   * Fills the provided corpus with documents created on the fly from
   * selected files in a directory. Uses a {@link FileFilter} to select
   * which files will be used and which will be ignored. A simple file
   * filter based on extensions is provided in the GATE distribution (
   * {@link gate.util.ExtensionFileFilter}).
   *
   * @param corpus the corpus to be populated
   * @param directory the directory from which the files will be picked.
   *          This parameter is a URL for uniformity. It needs to be a
   *          URL of type file, otherwise an IllegalArgumentException
   *          will be thrown.
   * @param filter the file filter used to select files from the target
   *          directory. If the filter is <tt>null</tt> all the files
   *          will be accepted.
   * @param encoding the encoding to be used for reading the documents
   * @param recurseDirectories should the directory be parsed
   *          recursively? If <tt>true</tt>, all the files from the
   *          provided directory and all its children directories (on as
   *          many levels as necessary) will be picked if accepted by
   *          the filter; otherwise the children directories will be
   *          ignored.
   * @throws IOException if a file doesn't exist
   */
  public static void populate(Corpus corpus, URL directory, FileFilter filter,
          String encoding, boolean recurseDirectories) throws IOException {
    populate(corpus, directory, filter, encoding, null, recurseDirectories);

  /**
   * Fills the provided corpus with documents created on the fly from
   * selected files in a directory. Uses a {@link FileFilter} to select
   * which files will be used and which will be ignored. A simple file
   * filter based on extensions is provided in the GATE distribution (
   * {@link gate.util.ExtensionFileFilter}).
   *
   * @param corpus the corpus to be populated
   * @param directory the directory from which the files will be picked.
   *          This parameter is a URL for uniformity. It needs to be a
   *          URL of type file, otherwise an IllegalArgumentException
   *          will be thrown.
   * @param filter the file filter used to select files from the target
   *          directory. If the filter is <tt>null</tt> all the files
   *          will be accepted.
   * @param encoding the encoding to be used for reading the documents
   * @param mimeType the mime type to be used when loading documents; if
   *          <tt>null</tt>, no mime type is passed to the created
   *          documents.
   * @param recurseDirectories should the directory be parsed
   *          recursively? If <tt>true</tt>, all the files from the
   *          provided directory and all its children directories (on as
   *          many levels as necessary) will be picked if accepted by
   *          the filter; otherwise the children directories will be
   *          ignored.
   * @throws IOException if a file doesn't exist
   */
  public static void populate(Corpus corpus, URL directory, FileFilter filter,
          String encoding, String mimeType, boolean recurseDirectories)
          throws IOException {

    // check input
      throw new IllegalArgumentException(
              "The URL provided is not of type \"file:\"!");

    File dir = Files.fileFromURL(directory);
    if(!dir.exists()) throw new FileNotFoundException(dir.toString());

      throw new IllegalArgumentException(dir.getAbsolutePath()
              + " is not a directory!");

    File[] files;
    // populate the corpus
    if(recurseDirectories) {
      files = Files.listFilesRecursively(dir, filter);
    else {
      files = dir.listFiles(filter);

    if(files == null) {

    // sort the files alphabetically regardless of their paths
    Arrays.sort(files, new Comparator() {
      public int compare(File f1, File f2) {
        return f1.getName().compareTo(f2.getName());

    // create the GATE documents
    for(File file : files) {
      if(file.isDirectory()) {
      StatusListener sListener = (StatusListener)Gate.getListeners().get(
      if(sListener != null)
        sListener.statusChanged("Reading: " + file.getName());
      String docName = file.getName() + "_" + Gate.genSym();
      FeatureMap params = Factory.newFeatureMap();
      params.put(Document.DOCUMENT_URL_PARAMETER_NAME, file.toURI().toURL());
      if(encoding != null)
        params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, encoding);
      if(mimeType != null)
        params.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, mimeType);

      try {
        Document doc = (Document)Factory.createResource(DocumentImpl.class
                .getName(), params, null, docName);
        if(corpus.getLRPersistenceId() != null) {
          // persistent corpus -> unload the document
      catch(Throwable t) {
        String nl = Strings.getNl();
        Err.prln("WARNING: Corpus.populate could not instantiate document" + nl
                + "  Document name was: " + docName + nl + "  Exception was: "
                + t + nl + nl);
      if(sListener != null) sListener.statusChanged(file.getName() + " read");

  }// public static void populate

  /**
   * Fills this corpus with documents created from files in a directory.
   *
   * @param filter the file filter used to select files from the target
   *          directory. If the filter is <tt>null</tt> all the files
   *          will be accepted.
   * @param directory the directory from which the files will be picked.
   *          This parameter is a URL for uniformity. It needs to be a
   *          URL of type file, otherwise an IllegalArgumentException
   *          will be thrown. An implementation for this method is
   *          provided as a static method at
   *          {@link gate.corpora.CorpusImpl#populate(Corpus, URL, FileFilter, String, boolean)}
   *          .
   * @param encoding the encoding to be used for reading the documents
   * @param recurseDirectories should the directory be parsed
   *          recursively? If <tt>true</tt>, all the files from the
   *          provided directory and all its children directories (on as
   *          many levels as necessary) will be picked if accepted by
   *          the filter; otherwise the children directories will be
   *          ignored.
   */
  public void populate(URL directory, FileFilter filter, String encoding,
          boolean recurseDirectories) throws IOException,
          ResourceInstantiationException {
    populate(this, directory, filter, encoding, null, recurseDirectories);

  /**
   * Fills this corpus with documents created from files in a directory.
   *
   * @param filter the file filter used to select files from the target
   *          directory. If the filter is <tt>null</tt> all the files
   *          will be accepted.
   * @param directory the directory from which the files will be picked.
   *          This parameter is a URL for uniformity. It needs to be a
   *          URL of type file, otherwise an IllegalArgumentException
   *          will be thrown. An implementation for this method is
   *          provided as a static method at
   *          {@link gate.corpora.CorpusImpl#populate(Corpus, URL, FileFilter, String, boolean)}
   *          .
   * @param encoding the encoding to be used for reading the documents
   * @param mimeType the mime type to be used when loading documents. If
   *          null, then the mime type will be detected automatically.
   * @param recurseDirectories should the directory be parsed
   *          recursively? If <tt>true</tt>, all the files from the
   *          provided directory and all its children directories (on as
   *          many levels as necessary) will be picked if accepted by
   *          the filter; otherwise the children directories will be
   *          ignored.
   */
  public void populate(URL directory, FileFilter filter, String encoding,
          String mimeType, boolean recurseDirectories) throws IOException,
          ResourceInstantiationException {
    populate(this, directory, filter, encoding, mimeType, recurseDirectories);

  /**
   * Fills the provided corpus with documents extracted from the
   * provided trec file.
   *
   * @param corpus the corpus to be populated.
   * @param singleConcatenatedFile the trec file.
   * @param documentRootElement text between this element (start and
   *          end) is considered for creating a new document.
   * @param encoding the encoding of the trec file.
   * @param numberOfDocumentsToExtract extracts the specified number of
   *          documents from the trecweb file; -1 to indicate all files.
   * @param documentNamePrefix the prefix to use for the names of the
   *          created documents; may be <tt>null</tt>.
   * @param mimeType the mime type which determines how the document is handled
   * @param includeRootElement if <tt>false</tt>, the start and end tags
   *          of the root element are stripped from the document content.
   * @return total length of populated documents in the corpus in number
   *         of bytes
   * @throws IOException if the file cannot be read
   */
  public static long populate(Corpus corpus, URL singleConcatenatedFile,
      String documentRootElement, String encoding,
      int numberOfDocumentsToExtract, String documentNamePrefix,
      String mimeType, boolean includeRootElement) throws IOException { 
    StatusListener sListener = (StatusListener)gate.Gate.getListeners().get("gate.event.StatusListener");
    // obtain the root element that user has provided
    // content between the start and end of root element is considered
    // for creating documents
    documentRootElement = documentRootElement.toLowerCase();

    // document name prefix could be an empty string
    documentNamePrefix = documentNamePrefix == null ? "" : documentNamePrefix
            + "_";

    // we start a new document when we find  and
    // close it when we find 
    BufferedReader br = null;
    try {
      if(encoding != null && encoding.trim().length() != 0) {
        br = new BomStrippingInputStreamReader(singleConcatenatedFile.openStream(),
                encoding, 10485760);
      else {
        br = new BomStrippingInputStreamReader(singleConcatenatedFile.openStream(),

      // reading line by line
      String line = br.readLine();

      // this is where we store document content
      StringBuilder documentString = new StringBuilder();

      // toggle switch to indicate search for start element
      boolean searchingForStartElement = true;

      // keeping count of number of documents extracted
      int count = 1;

      // length in bytes read so far (to return)
      long lengthInBytes = 0;

      // continue until reached the end of file
      while(line != null) {

        // lowercase the line in order to match documentRootElement in any case
        String lowerCasedLine = line.toLowerCase();

        // if searching for startElement?
        if(searchingForStartElement) {

          // may be its with attributes
          int index = lowerCasedLine.indexOf("<" + documentRootElement + " ");

          // may be no attributes?
          if(index == -1) {
            index = lowerCasedLine.indexOf("<" + documentRootElement + ">");

          // if index <0, we are out of the content boundaries, so simply
          // skip the current line and start reading from the next line
          if(index != -1) {
            // if found, that's the first line
            line = line.substring(index);
            searchingForStartElement = false;
          else {
            line = br.readLine();
        else {

          // now searching for last element
          int index = lowerCasedLine.indexOf("");

          // if not found.. this is the content of a new document
          if(index == -1) {
            documentString.append(line + "\n");
            line = br.readLine();
          else {

            // found.. then end the document
            documentString.append(line.substring(0, index + documentRootElement.length() + 3));

            // getting ready for the next document
            searchingForStartElement = true;

            // here lets create a new document create the doc
            if(sListener != null) sListener.statusChanged("Creating Document Number :" + count);
            String docName = documentNamePrefix + count + "_" + Gate.genSym();
            String docContent = documentString.toString();
            if (!includeRootElement)
              docContent = docContent.substring(docContent.indexOf(">")+1, docContent.lastIndexOf("<"));
            FeatureMap params = Factory.newFeatureMap();
            if (mimeType != null) params.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, mimeType);            
            params.put(Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME, docContent);
            if(encoding != null && encoding.trim().length() > 0)
              params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, encoding); 
            // calculate the length
            lengthInBytes += docContent.getBytes().length;            

            try {
              Document doc = (Document)Factory.createResource(DocumentImpl.class.getName(), params, null, docName);
              if(corpus.getLRPersistenceId() != null) {
                // persistent corpus -> unload the document
              // already extracted requested num of documents?
              if((count - 1) == numberOfDocumentsToExtract) break;
            catch(Throwable t) {
              String nl = Strings.getNl();
              Err.prln("WARNING: Corpus.populate could not instantiate document" + nl
                  + "  Document name was: " + docName + nl
                  + "  Exception was: " + t + nl + nl);
            documentString = new StringBuilder();
            if(sListener != null) sListener.statusChanged(docName + " created!");
            line = line.substring(index + documentRootElement.length() + 3);
            if (line.trim().equals("")) line = br.readLine();
      return lengthInBytes;
    finally {
      if(br != null) br.close();
  }// public static void populate

  /**
   * Fills this corpus with documents extracted from the
   * provided single concatenated file.
   *
   * @param singleConcatenatedFile the single concatenated file to load.
   * @param documentRootElement content between the start and end of
   *          this element is considered for documents.
   * @param encoding the encoding of the trec file.
   * @param numberOfFilesToExtract indicates the number of files to
   *          extract from the trecweb file.
   * @param documentNamePrefix the prefix to use for document names when
   *          creating documents from the concatenated file.
   * @param mimeType the mime type which determines how the document is handled
   * @param includeRootElement if <tt>false</tt>, the start and end tags
   *          of the root element are stripped from the document content.
   * @return total length of populated documents in the corpus in number
   *         of bytes
   */
  public long populate(URL singleConcatenatedFile, String documentRootElement,
      String encoding, int numberOfFilesToExtract,
      String documentNamePrefix, String mimeType, boolean includeRootElement) throws IOException,
      ResourceInstantiationException {
    return CorpusImpl.populate(this, singleConcatenatedFile,
        documentRootElement, encoding, numberOfFilesToExtract,
        documentNamePrefix, mimeType, includeRootElement);

  public synchronized void removeCorpusListener(CorpusListener l) {
    if(corpusListeners != null && corpusListeners.contains(l)) {
      Vector v = (Vector)corpusListeners.clone();
      corpusListeners = v;

  public synchronized void addCorpusListener(CorpusListener l) {
    Vector v = corpusListeners == null
            ? new Vector(2)
            : (Vector)corpusListeners.clone();
    if(!v.contains(l)) {
      corpusListeners = v;

  /**
   * Custom duplication for a corpus - duplicate this corpus in the
   * usual way, then duplicate the documents in this corpus and add them
   * to the duplicate.
   */
  public Resource duplicate(Factory.DuplicationContext ctx)
          throws ResourceInstantiationException {
    Corpus newCorpus = (Corpus)Factory.defaultDuplicate(this, ctx);
    for(Document d : this) {
      newCorpus.add((Document)Factory.duplicate(d, ctx));
    return newCorpus;

  /** Freeze the serialization UID. */
  static final long serialVersionUID = -1113142759053898456L;

  /**
   * Registered corpus listeners, or null when none has been added yet.
   * Replaced wholesale by add/removeCorpusListener rather than mutated.
   */
  private transient Vector<CorpusListener> corpusListeners;

  /**
   * Documents supplied through {@link #setDocumentsList}; consulted by
   * {@link #init()}.
   */
  protected transient List<Document> documentsList;

  protected void fireDocumentAdded(CorpusEvent e) {
    if(corpusListeners != null) {
      Vector listeners = corpusListeners;
      int count = listeners.size();
      for(int i = 0; i < count; i++) {

  protected void fireDocumentRemoved(CorpusEvent e) {
    if(corpusListeners != null) {
      Vector listeners = corpusListeners;
      int count = listeners.size();
      for(int i = 0; i < count; i++) {

  @CreoleParameter(collectionElementType = Document.class, comment = "A list of GATE documents")
  public void setDocumentsList(java.util.List documentsList) {
    this.documentsList = documentsList;

  public java.util.List getDocumentsList() {
    return documentsList;

  public void resourceLoaded(CreoleEvent e) {

  public void resourceUnloaded(CreoleEvent e) {
    Resource res = e.getResource();
    // remove all occurences
    if(res instanceof Document) while(contains(res))

  public void resourceRenamed(Resource resource, String oldName, String newName) {

  public void datastoreOpened(CreoleEvent e) {

  public void datastoreCreated(CreoleEvent e) {

  public void datastoreClosed(CreoleEvent e) {
} // class CorpusImpl

© 2015 - 2024 Weber Informatics LLC | Privacy Policy