gate.corpora.TextualDocumentFormat Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of gate-core Show documentation
GATE - General Architecture for Text Engineering - is open source software capable of solving almost any text processing problem. This artifact enables you to embed the core of GATE Embedded with its essential dependencies. You will be able to use the GATE Embedded API and load and store GATE XML documents. This artifact is the perfect dependency for CREOLE plugins, or for applications that need to customize the GATE dependencies because of conflicts with their own dependencies or to achieve a lower footprint.
The newest version!
/*
 *  TextualDocumentFormat.java
 *
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Cristian URSU, 26/May/2000
 *
 *  $Id: TextualDocumentFormat.java 19663 2016-10-10 08:44:57Z markagreenwood $
 */

package gate.corpora;

import java.io.IOException;

import gate.*;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.AutoInstance;
import gate.creole.metadata.CreoleResource;
import gate.util.DocumentFormatException;

//import org.w3c.www.mime.*;

/** The document format handler for plain text documents. On
  * initialisation it registers itself for the <code>text/plain</code> MIME
  * type and for the <code>txt</code> and <code>text</code> file suffixes.
  * Unpacking the "markup" of a plain text document consists of recording
  * the kind of line break the document uses and of creating
  * <code>paragraph</code> annotations in the original markups annotation
  * set.
  */
@CreoleResource(name = "GATE Textual Document Format", isPrivate = true,
    autoinstances = {@AutoInstance(hidden = true)})
public class TextualDocumentFormat extends DocumentFormat
{
  private static final long serialVersionUID = -5630380244338599927L;

  /** Default construction */
  public TextualDocumentFormat() { super(); }

  /** Initialise this resource, and return it. Registers this instance as
    * the handler for the <code>text/plain</code> MIME type and maps the
    * <code>txt</code> and <code>text</code> file suffixes to that type.
    *
    * @return this resource
    * @throws ResourceInstantiationException declared by the overridden
    * method; not thrown by the code in this method itself.
    */
  @Override
  public Resource init() throws ResourceInstantiationException{
    // The plain text mime type handled by this class
    MimeType mime = new MimeType("text","plain");
    // The key under which both the handler and the mime type are registered
    String mimeString = mime.getType() + "/" + mime.getSubtype();
    // Register the class handler for this mime type
    mimeString2ClassHandlerMap.put(mimeString, this);
    // Register the mime type with mime string
    mimeString2mimeTypeMap.put(mimeString, mime);
    // Register file suffixes for this mime type
    suffixes2mimeTypeMap.put("txt",mime);
    suffixes2mimeTypeMap.put("text",mime);
    // Set the mimeType for this language resource
    setMimeType(mime);
    return this;
  } // init()

  /** Unpack the markup in the document. For plain text this records the
    * line break type of the document as a document feature, normalises
    * CR-only line breaks to LF and then creates paragraph annotations
    * over the whole content in the original markups annotation set.
    *
    * @param doc the document to process. If it is null or has no content
    * the method returns without doing anything.
    * @throws DocumentFormatException if a paragraph annotation cannot be
    * created.
    */
  @Override
  public void unpackMarkup(Document doc) throws DocumentFormatException{
    if (doc == null || doc.getContent() == null) return;
    setNewLineProperty(doc);

    // For some reason old style Mac documents with CR line endings don't
    // display properly in GATE so once we know we have a CR line terminated
    // file, we replace all the \r with \n to make sure it displays correctly.
    //
    // Documentation for JTextArea suggests we don't need to do this as it
    // handles new lines properly, but that's only true when using the read
    // methods not when using setText() which is how we get the document content
    // in to the GUI
    if("CR"
        .equals(doc.getFeatures().get(GateConstants.DOCUMENT_NEW_LINE_TYPE))) {
      String content = doc.getContent().toString();
      content = content.replace('\r', '\n');
      doc.setContent(new DocumentContentImpl(content));
    }

    // Create paragraph annotations in the specified annotation set
    int endOffset = doc.getContent().toString().length();
    int startOffset = 0;
    annotateParagraphs(doc,startOffset,endOffset,
                                GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
  }//unpackMarkup

  /** Unpack the markup in the document. The repositioning information is
    * not used for plain text; this simply delegates to
    * {@link #unpackMarkup(Document)}.
    *
    * @param doc the document to process
    * @param repInfo ignored by this implementation
    * @param ampCodingInfo ignored by this implementation
    * @throws DocumentFormatException if a paragraph annotation cannot be
    * created.
    */
  @Override
  public void unpackMarkup(Document doc, RepositioningInfo repInfo,
                            RepositioningInfo ampCodingInfo)
                                      throws DocumentFormatException {
    unpackMarkup(doc);
  } // unpackMarkup

  /**
   * This is a test to see if the GATE document has a valid URL or a
   * valid content.
   *
   * @param doc the document to check; must not be null
   * @return <code>true</code> if the document has content but its source
   * URL is either null or could not be opened, <code>false</code> if a
   * connection object could be obtained from the source URL.
   * @throws DocumentFormatException if the document has neither a usable
   * source URL nor any content.
   */
  protected static boolean hasContentButNoValidUrl(Document doc)
          throws DocumentFormatException {
    java.net.URL sourceUrl = doc.getSourceUrl();
    if(sourceUrl == null) {
      // Previously a null URL combined with null content fell through to
      // sourceUrl.openConnection() and failed with a NullPointerException.
      if(doc.getContent() == null)
        throw new DocumentFormatException("The document doesn't have a"
                + " valid URL and also no content");
      // The doc's url is null but there is a content.
      return true;
    }

    try {
      // NOTE(review): openConnection() may not actually contact the
      // resource, so this is a weak validity check - confirm that this
      // is sufficient for the callers.
      sourceUrl.openConnection();
    }
    catch(IOException ex1) {
      // The URL is not null but is not valid.
      if(doc.getContent() == null)
      // The document content is also null. There is nothing we can do.
        throw new DocumentFormatException("The document doesn't have a"
                + " valid URL and also no content", ex1);
      return true;
    }// End try

    return false;
  }


  /**
   * Check the new line sequence and set document property.
   * <p>
   * The type is decided by the first line break found in the content and
   * is stored in the document features under
   * {@link GateConstants#DOCUMENT_NEW_LINE_TYPE}. Possible values are
   * CRLF, LFCR, CR, LF, or the empty string if the content contains no
   * line break at all.
   *
   * @param doc the document to inspect; its content must not be null
   */
  protected void setNewLineProperty(Document doc) {
    String content = doc.getContent().toString();
    String newLineType = "";

    final int length = content.length();
    for(int i = 0; i < length; ++i) {
      char ch = content.charAt(i);
      if(ch != '\r' && ch != '\n') continue;

      // Look at the character following the first line break character.
      // When the line break is the very last character there is nothing
      // following it, so it is a single CR or LF (this case used to be
      // missed and reported as "").
      char next = (i + 1 < length) ? content.charAt(i + 1) : '\0';
      if(ch == '\r') {
        newLineType = (next == '\n') ? "CRLF" : "CR";
      }
      else {
        newLineType = (next == '\r') ? "LFCR" : "LF";
      }
      break;
    } // for

    doc.getFeatures().put(GateConstants.DOCUMENT_NEW_LINE_TYPE, newLineType);
  } // setNewLineProperty()

  /** Delete '\r' in combination CRLF or LFCR in document content */
  @SuppressWarnings("unused")
  private void removeExtraNewLine(Document doc) {
    String content = doc.getContent().toString();
    StringBuilder buff = new StringBuilder(content);

    // The content is scanned backwards so that deleting a character from
    // the buffer never shifts the positions that are still to be visited.
    char ch = ' ';
    char lastch = ' ';
    for(int i=content.length()-1; i > -1; --i) {
      ch = content.charAt(i);
      if(ch == '\n' && lastch == '\r') {
        // LFCR: drop the '\r' that follows this '\n'
        buff.deleteCharAt(i+1);
      }
      if(ch == '\r' && lastch == '\n') {
        // CRLF: drop this '\r' and carry on as if the '\n' had just been read
        buff.deleteCharAt(i);
        ch = lastch;
      }
      lastch = ch;
    } // for

    doc.setContent(new DocumentContentImpl(buff.toString()));
  } // removeExtraNewLine(Document doc)

  /** This method annotates paragraphs in a GATE document. The investigated text
    * spans between start and end offsets and the paragraph annotations are
    * created in the annotSetName. If annotSetName is null then they are created
    * in the default annotation set.
    * <p>
    * A paragraph starts at the first non whitespace character and ends
    * when a line break is immediately followed by another line break, or
    * when the end offset is reached. A line break is a '\n' together with
    * any '\r' characters directly following it.
    *
    * @param aDoc is the gate document on which the paragraph detection would
    *  be performed. If it is null or its content is null then the method
    *  simply returns doing nothing.
    * @param startOffset is the index from the document content from which the
    * paragraph detection will start
    * @param endOffset is the offset where the detection will end. If it is
    * smaller than startOffset the method returns doing nothing.
    * @param annotSetName is the name of the set in which paragraph annotation
    * would be created. The annotation type created will be "paragraph"
    * @throws DocumentFormatException if a paragraph annotation cannot be
    * created because of invalid offsets.
    */
  public void annotateParagraphs(Document aDoc,int startOffset,int endOffset,
                            String annotSetName)throws DocumentFormatException{
    // Simply return if the document is null or its content
    if (aDoc == null || aDoc.getContent() == null) return;
    // Simply return if the start is > than the end
    if (startOffset > endOffset) return;
    // Decide where to put the newly detected annotations
    AnnotationSet annotSet = (annotSetName == null)
        ? aDoc.getAnnotations()
        : aDoc.getAnnotations(annotSetName);
    // Extract the document content
    String content = aDoc.getContent().toString();
    // This is the offset marking the start of a para
    int startOffsetPara = startOffset;
    // This marks the end of a para
    int endOffsetPara = endOffset;
    // The initial state of the FSA
    int state = 1;
    int index = startOffset;
    while (index < endOffset){
      // Read the current char
      char ch = content.charAt(index);
      // Test if a BR entity was read. A BR entity can be NL or NL CR,
      // depending on the operating system. The flag is deliberately local
      // to the current character: it used to survive from state 1 into
      // state 2, which cut a paragraph after its first character when the
      // paragraph was preceded by a newline read in state 1.
      boolean readBR = (ch == '\n');
      if (readBR){
        // If \n is followed by a \r then advance the index in order to read a
        // BR entity
        while ((index+1 < endOffset) && (content.charAt(index+1) == '\r'))
          index ++;
      }// End if
      switch(state){
        // It is the initial and also a final state
        // Stay in state 1 while it reads whitespaces
        case 1:
          // If reads a non whitespace char then move to state 2 and record
          // the beginning of a paragraph
          if (!Character.isWhitespace(ch)){
            state = 2;
            startOffsetPara = index;
          }// End if
          break;
        // It can be also a final state.
        // Stay in state 2 while reading chars != BR entities
        case 2:
          if (readBR){
            // A BR was found, go to state 3. The possible end of the para
            // can be index. This will be confirmed by state 3. So, this is
            // why the end of a para is recorded here.
            endOffsetPara = index;
            state = 3;
          }// End if
          break;
        // It can be also a final state
        // From state 3 there are only 2 possible ways: a second BR ends the
        // paragraph (state 1), anything else continues it (state 2)
        case 3:
          if (readBR){
            state = 1;
            addParagraph(annotSet, startOffsetPara, endOffsetPara);
          }else{
            // Go to state 2 and keep reading chars
            state = 2;
          }// End if
          break;
      }// End switch
      // Prepare to read the next char.
      index ++;
    }// End while
    // Investigate where the finite automata has stopped. If a paragraph
    // is still open, close it at the position where the scan ended.
    if ( state==2 || state==3 ){
      addParagraph(annotSet, startOffsetPara, index);
    }// End if
  }// End annotateParagraphs();

  /** Adds a "paragraph" annotation with an empty feature map to the given
    * annotation set, converting an invalid offset error into a
    * DocumentFormatException.
    */
  private void addParagraph(AnnotationSet annotSet, int start, int end)
                                            throws DocumentFormatException{
    try{
      annotSet.add(Long.valueOf(start),
                    Long.valueOf(end),
                    "paragraph",
                    Factory.newFeatureMap());
    } catch (gate.util.InvalidOffsetException ioe){
      throw new DocumentFormatException("Coudn't create a paragraph"+
      " annotation",ioe);
    }// End try
  }// End addParagraph()

  /** This format is not stored in a data store.
    *
    * @return always <code>null</code>
    */
  @Override
  public DataStore getDataStore(){ return null;}

} // class TextualDocumentFormat