// gate.DocumentFormat  Maven / Gradle / Ivy

/*
 *  DocumentFormat.java
 *
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Hamish Cunningham, 25/May/2000
 *
 *  $Id: DocumentFormat.java 19756 2016-11-19 01:55:44Z markagreenwood $
 */

package gate;

import gate.corpora.MimeType;
import gate.corpora.RepositioningInfo;
import gate.creole.AbstractLanguageResource;
import gate.event.StatusListener;
import gate.util.BomStrippingInputStreamReader;
import gate.util.DocumentFormatException;
import gate.util.GateException;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.file.Files;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.Vector;

import org.apache.commons.io.IOUtils;

/** The format of Documents. Subclasses of DocumentFormat know about
  * particular MIME types and how to unpack the information in any
  * markup or formatting they contain into GATE annotations. Each MIME
  * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat,
  * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves
  * with a static index residing here when they are constructed. Static
  * getDocumentFormat methods can then be used to get the appropriate
  * format class for a particular document.
  */
public abstract class DocumentFormat
extends AbstractLanguageResource {
  
  /** Serialization version identifier of this class. */
  private static final long serialVersionUID = 4147880563349143923L;

  /** The MIME type of this format. */
  private MimeType mimeType = null;

  /** Map of MIME type string to the DocumentFormat handling it. Lookups in
    * this class use keys of the form "type/subtype". This is used to find the
    * language resource that deals with a specific document format.
    */
  protected static final Map<String, DocumentFormat>
          mimeString2ClassHandlerMap = new HashMap<String, DocumentFormat>();

  /** Map of MIME type string to MimeType object. This is used to turn a
    * MIME type string (for instance the one reported by a web server) into
    * the registered MimeType.
    */
  protected static final Map<String, MimeType>
          mimeString2mimeTypeMap = new HashMap<String, MimeType>();

  /** Map of file suffix to MimeType. This is used to figure out what MIME
    * type a document is from its file name. Lookups in this class lower-case
    * the suffix first.
    */
  protected static final Map<String, MimeType>
          suffixes2mimeTypeMap = new HashMap<String, MimeType>();

  /** Map of magic string to MimeType. This is used to guess the MIME type of
    * a document from its content, when we don't have any other clues. The
    * magic strings are searched for case-insensitively.
    */
  protected static final Map<String, MimeType>
          magic2mimeTypeMap = new HashMap<String, MimeType>();

  /** Map of markup elements to annotation types. If it is null, the
    * unpackMarkup() method will convert all markup, using the element names
    * for annotation types. If it is non-null, only those elements specified
    * here will be converted.
    */
  protected Map markupElementsMap = null;

  /** This map is used inside the unpackMarkup() method.
    * When an element from the map is encountered, the corresponding string
    * element is added to the document content.
    */
  protected Map element2StringMap = null;

  /** The features of this resource */
  private FeatureMap features = null;

  /** Default constructor; it leaves every field at its declared initial value. */
  public DocumentFormat() {
  }

  /** Listeners for status reports. The add/remove methods of this class never
    * modify the vector in place: they clone it, change the clone and then
    * replace this reference.
    */
  private transient Vector<StatusListener> statusListeners;

  /** Flag for enabling/disabling the collecting of repositioning information */
  private Boolean shouldCollectRepositioning = Boolean.FALSE;

  /**
   * Reports whether this document format can collect repositioning
   * information during the unpack phase. This base implementation always
   * answers {@link Boolean#FALSE}; a subclass whose format is able to collect
   * repositioning information should override it.
   *
   * @return {@code Boolean.FALSE} in this base class
   */
  public Boolean supportsRepositioning() {
    final Boolean supported = Boolean.FALSE;
    return supported;
  } // supportsRepositioning

  /**
   * Enables or disables the collecting of repositioning information. The
   * request is only honoured when {@link #supportsRepositioning()} answers
   * true and {@code b} is true; in every other case the flag is reset to
   * {@link Boolean#FALSE}.
   *
   * @param b the requested setting
   */
  public void setShouldCollectRepositioning(Boolean b) {
    // b is only inspected when the format supports repositioning at all
    boolean honoured = supportsRepositioning().booleanValue()
                       && b.booleanValue();
    shouldCollectRepositioning = honoured ? b : Boolean.FALSE;
  } // setShouldCollectRepositioning

  /** @return the current value of the repositioning-collection flag */
  public Boolean getShouldCollectRepositioning() {
    final Boolean current = this.shouldCollectRepositioning;
    return current;
  } // getShouldCollectRepositioning

  /** Unpacks the markup in the document, converting it from the native
    * format (e.g. XML, RTF) into annotations in GATE format. The
    * markupElementsMap determines which elements are converted and which
    * annotation type names are used.
    *
    * @param doc the document whose markup is unpacked
    * @throws DocumentFormatException declared so that implementations can
    *         report a failure to unpack the document
    */
  public abstract void unpackMarkup(Document doc)
      throws DocumentFormatException;

  /** Variant of {@link #unpackMarkup(Document)} that additionally receives
    * two RepositioningInfo objects.
    * NOTE(review): presumably implementations record repositioning data in
    * them while unpacking -- confirm against the implementing subclasses.
    *
    * @param doc the document whose markup is unpacked
    * @param repInfo a RepositioningInfo object
    * @param ampCodingInfo a second RepositioningInfo object
    * @throws DocumentFormatException declared so that implementations can
    *         report a failure to unpack the document
    */
  public abstract void unpackMarkup(Document doc, RepositioningInfo repInfo,
      RepositioningInfo ampCodingInfo) throws DocumentFormatException;
  /** Stores the document's current content (as a String) in a feature of the
    * document and then delegates to {@link #unpackMarkup(Document)}. This is
    * useful when the original content must survive unpacking, because after
    * the markup has been unpacked the document content is replaced with the
    * text found between the markup.
    *
    * @param doc the document that will be unpacked
    * @param originalContentFeatureType the name of the feature that will hold
    * the document's original content
    * @throws DocumentFormatException propagated from
    * {@link #unpackMarkup(Document)}
    */
  public void unpackMarkup(Document doc, String originalContentFeatureType)
      throws DocumentFormatException {
    FeatureMap docFeatures = doc.getFeatures();
    if (docFeatures == null) {
      // the document has no feature map yet, so give it a fresh one
      docFeatures = Factory.newFeatureMap();
    }
    String originalContent = doc.getContent().toString();
    docFeatures.put(originalContentFeatureType, originalContent);
    doc.setFeatures(docFeatures);
    unpackMarkup(doc);
  } // unpackMarkup(doc, originalContentFeatureType)

  /**
    * Looks up the MimeType registered for a file suffix. The suffix is
    * lower-cased before the lookup in suffixes2mimeTypeMap, so for example a
    * suffix registered for HTML yields the HTML MimeType regardless of the
    * case used by the caller.
    * @param fileSufix the file suffix to look up; may be null
    * @return the MimeType associated with the suffix, or null when the suffix
    * is null or not registered
    */
  private static MimeType getMimeType(String fileSufix) {
    return (fileSufix == null)
        ? null
        : suffixes2mimeTypeMap.get(fileSufix.toLowerCase());
  } // getMimeType(String)
  
  /**
   * Utility method to get the set of all MIME type strings that are
   * registered with this class.
   *
   * @return an unmodifiable view of the keys of the MIME-type-string registry
   */
  public static Set<String> getSupportedMimeTypes() {
    return Collections.unmodifiableSet(mimeString2mimeTypeMap.keySet());
  }

  /**
    * Determines the MimeType of the resource behind a URL. Three guesses are
    * collected and handed to decideBetweenThreeMimeTypes():
    * <ol>
    * <li>the content type reported when connecting to the URL,</li>
    * <li>the file suffixes of the URL's path, tried in the order returned by
    *     getFileSuffixes(),</li>
    * <li>magic strings found at the start of the content, decoded with the
    *     charset reported in the content type, if any.</li>
    * </ol>
    * The stream opened on the URL is always closed before returning.
    * @param url The URL object from which the MimeType will be extracted
    * @return A MimeType object for that URL, or null if the URL is null or
    * the Mime Type is unknown.
    */
  static private MimeType  getMimeType(URL url) {
    if (url == null)
      return null;

    String contentType = null;
    InputStream is = null;
    MimeType mimeTypeFromWebServer = null;
    MimeType mimeTypeFromFileSuffix = null;
    MimeType mimeTypeFromMagicNumbers = null;

    try {
      // Ask the web server for the content type. We expect something like
      // "text/html; charset=iso-8859-1", where the parameters are optional.
      try {
        URLConnection urlconn = url.openConnection();
        is = urlconn.getInputStream();
        contentType = urlconn.getContentType();
      } catch (IOException ignored) {
        // Deliberately best effort: if the server cannot be asked we still
        // have the file suffix (and, if the stream was opened, the magic
        // numbers) to go on.
      }

      String mimeTypeString = null;
      String charsetFromWebServer = null;
      if (contentType != null) {
        StringTokenizer st = new StringTokenizer(contentType, ";");
        // The first token is the media type itself; trim it so that
        // "text/html ; charset=..." still matches the registered string.
        if (st.hasMoreTokens())
          mimeTypeString = st.nextToken().trim().toLowerCase();
        // The remaining tokens are name=value parameters. Only the one named
        // "charset" is an encoding; others (e.g. "boundary") are skipped.
        while (charsetFromWebServer == null && st.hasMoreTokens())
          charsetFromWebServer = charsetFromParameter(st.nextToken());
      }

      // The MimeType registered for the string reported by the web server
      // (null when there was no content type or it is not registered)
      mimeTypeFromWebServer = mimeString2mimeTypeMap.get(mimeTypeString);

      // Let's try a file suffix detection: the first suffix that is
      // registered wins
      for (String suffix : getFileSuffixes(url)) {
        mimeTypeFromFileSuffix = getMimeType(suffix);
        if (mimeTypeFromFileSuffix != null) break;
      }

      // Let's perform a magic numbers guess (null safe with respect to is)
      mimeTypeFromMagicNumbers =
          guessTypeUsingMagicNumbers(is, charsetFromWebServer);
    } finally {
      IOUtils.closeQuietly(is); // null safe
    }

    // All those types enter into a deciding system
    return decideBetweenThreeMimeTypes(mimeTypeFromWebServer,
                                       mimeTypeFromFileSuffix,
                                       mimeTypeFromMagicNumbers);
  } // getMimeType(URL)

  /**
    * Extracts the encoding name from one content type parameter.
    * @param parameter a single ";"-separated parameter of a content type,
    * e.g. " charset=iso-8859-1" or "charset=\"utf-8\""
    * @return the upper-cased encoding name with surrounding double quotes
    * removed, or null if the parameter is not a charset parameter or has an
    * empty value
    */
  private static String charsetFromParameter(String parameter) {
    int equalsPos = parameter.indexOf('=');
    if (equalsPos < 0) return null;

    String name = parameter.substring(0, equalsPos).trim();
    if (!"charset".equalsIgnoreCase(name)) return null;

    String value = parameter.substring(equalsPos + 1).trim();
    // a parameter value may be written as a quoted string
    if (value.length() >= 2 && value.charAt(0) == '"'
        && value.charAt(value.length() - 1) == '"') {
      value = value.substring(1, value.length() - 1).trim();
    }
    return value.isEmpty() ? null : value.toUpperCase();
  } // charsetFromParameter

  /**
    * Picks one MimeType out of three guesses. Two guesses that are equal in
    * the sense of areEqual() out-vote the third. When no two guesses agree,
    * each non-null guess is tagged with a "Priority" parameter (file suffix 1,
    * web server 2, magic numbers 3) and decideBetweenTwoMimeTypes() chooses
    * among them.
    * @param aMimeTypeFromWebServer a MimeType
    * @param aMimeTypeFromFileSuffix a MimeType
    * @param aMimeTypeFromMagicNumbers a MimeType
    * @return the chosen MimeType. If all are null, then returns null
    */
  protected static MimeType decideBetweenThreeMimeTypes(
      MimeType aMimeTypeFromWebServer,
      MimeType aMimeTypeFromFileSuffix,
      MimeType aMimeTypeFromMagicNumbers) {

    // Voting round: the file suffix guess wins as soon as one of the other
    // two guesses agrees with it
    boolean suffixIsConfirmed =
        areEqual(aMimeTypeFromWebServer, aMimeTypeFromFileSuffix)
        || areEqual(aMimeTypeFromFileSuffix, aMimeTypeFromMagicNumbers);
    if (suffixIsConfirmed) {
      return aMimeTypeFromFileSuffix;
    }
    if (areEqual(aMimeTypeFromWebServer, aMimeTypeFromMagicNumbers)) {
      return aMimeTypeFromWebServer;
    }

    // No agreement: rank the guesses, 1 being the highest priority
    if (aMimeTypeFromFileSuffix != null) {
      aMimeTypeFromFileSuffix.addParameter("Priority", "1");
    }
    if (aMimeTypeFromWebServer != null) {
      aMimeTypeFromWebServer.addParameter("Priority", "2");
    }
    if (aMimeTypeFromMagicNumbers != null) {
      aMimeTypeFromMagicNumbers.addParameter("Priority", "3");
    }

    MimeType serverOrSuffix = decideBetweenTwoMimeTypes(
        aMimeTypeFromWebServer, aMimeTypeFromFileSuffix);
    return decideBetweenTwoMimeTypes(serverOrSuffix,
                                     aMimeTypeFromMagicNumbers);
  } // decideBetweenThreeMimeTypes

  /** Chooses between two MimeTypes by comparing their "Priority" parameters,
    * as set by decideBetweenThreeMimeTypes(). A null argument loses to the
    * other one. A missing "Priority" parameter counts as priority 0, and a
    * MimeType whose "Priority" value is not an integer loses to the other
    * one. The lower number wins; on a tie the first argument is returned.
    * @param aMimeType a MimeType object, possibly carrying a "Priority"
    * parameter
    * @param anotherMimeType a MimeType object, possibly carrying a "Priority"
    * parameter
    * @return One of the two mime types.
    */
  protected static MimeType decideBetweenTwoMimeTypes(MimeType aMimeType,
                                                      MimeType anotherMimeType) {
    if (aMimeType == null) return anotherMimeType;
    if (anotherMimeType == null) return aMimeType;

    // From here on both candidates are non-null
    int firstPriority = 0;
    if (aMimeType.hasParameter("Priority")) {
      try {
        firstPriority =
            Integer.parseInt(aMimeType.getParameterValue("Priority"));
      } catch (NumberFormatException e) {
        return anotherMimeType;
      }
    }

    int secondPriority = 0;
    if (anotherMimeType.hasParameter("Priority")) {
      try {
        secondPriority =
            Integer.parseInt(anotherMimeType.getParameterValue("Priority"));
      } catch (NumberFormatException e) {
        return aMimeType;
      }
    }

    // The lower the number, the higher the priority
    return (firstPriority > secondPriority) ? anotherMimeType : aMimeType;
  } // decideBetweenTwoMimeTypes

  /**
    * Tests whether two MimeType objects denote the same type.
    * @param aMimeType a MimeType, may be null
    * @param anotherMimeType a MimeType, may be null
    * @return true only if both MimeType objects are non-null and both their
    * types and their subtypes are equal according to equals() on the values
    * returned by getType() and getSubtype().
    */
  protected static boolean areEqual(MimeType aMimeType,
                                    MimeType anotherMimeType) {
    return aMimeType != null
        && anotherMimeType != null
        && aMimeType.getType().equals(anotherMimeType.getType())
        && aMimeType.getSubtype().equals(anotherMimeType.getSubtype());
  } // areEqual

  /**
    * Tries to guess the mime type from the beginning of a stream, using the
    * registered magic numbers.
    * @param aInputStream the InputStream to read from; it is wrapped in a
    *        BomStrippingInputStreamReader
    * @param anEncoding the encoding used to decode the stream. If it is null
    * or unsupported, a reader using the default encoding is created instead.
    * @return the mime type associated with the magic numbers found, or null
    * if the stream is null or nothing was detected
    */
  protected static MimeType guessTypeUsingMagicNumbers(InputStream aInputStream,
                                                       String anEncoding) {
    if (aInputStream == null) {
      return null;
    }

    Reader reader;
    if (anEncoding == null) {
      // no encoding given: use the default encoding
      reader = new BomStrippingInputStreamReader(aInputStream);
    } else {
      try {
        reader = new BomStrippingInputStreamReader(aInputStream, anEncoding);
      } catch (UnsupportedEncodingException e) {
        // unknown encoding: fall back to the default encoding
        reader = new BomStrippingInputStreamReader(aInputStream);
      }
    }

    return runMagicNumbers(reader);
  } // guessTypeUsingMagicNumbers

  /** Reads up to 2048 characters from the reader with a single read call and
    * runs the magic numbers detection on them.
    * @param aReader the reader to sample; may be null
    * @return the detected MimeType, or null if the reader is null, reading
    * fails, the reader is already at its end, or nothing is detected
    */
  protected static MimeType runMagicNumbers(Reader aReader) {
    if (aReader == null) {
      return null;
    }

    final int bufferSize = 2048;
    char[] cbuf = new char[bufferSize];
    int charReads;
    try {
      charReads = aReader.read(cbuf, 0, bufferSize);
    } catch (IOException e) {
      return null;
    }

    if (charReads == -1) {
      // the document is empty
      return null;
    }

    // Search the characters actually read; if this fails then surrender
    return getTypeFromContent(new String(cbuf, 0, charReads));
  } // runMagicNumbers

  /**
    * Guesses a MimeType from document content by searching it,
    * case-insensitively, for every registered magic string.
    * <p>
    * Among the magic strings that occur in the content, the one occurring
    * first wins, with one exception: a match for any type other than the one
    * registered for the "xml" suffix always beats an XML match, whatever
    * their offsets. The earliest XML match and the earliest non-XML match
    * are tracked separately so that the result does not depend on the
    * iteration order of the magic map.
    *
    * @param aContent the content to inspect; must not be null
    * @return the detected MimeType, or null if no magic string was found
    */
  private static MimeType getTypeFromContent(String aContent){

    // change case to cover more variants
    aContent = aContent.toLowerCase();

    // the type registered for XML (null if none is registered, in which case
    // every match is treated as non-XML and plain "first wins" applies)
    MimeType xmlMime = getMimeType("xml");

    // earliest match of the XML type
    MimeType firstXml = null;
    int firstXmlOffset = Integer.MAX_VALUE;

    // earliest match of any other type
    MimeType firstOther = null;
    int firstOtherOffset = Integer.MAX_VALUE;

    // Run the magic numbers test
    for(Map.Entry<String, MimeType> kv : magic2mimeTypeMap.entrySet()) {
      // the offset of this magic code in the content
      int offset = aContent.indexOf(kv.getKey().toLowerCase());
      if(offset == -1) continue;

      MimeType candidate = kv.getValue();
      if(candidate.equals(xmlMime)) {
        if(offset < firstXmlOffset) {
          firstXml = candidate;
          firstXmlOffset = offset;
        }
      }
      else if(offset < firstOtherOffset) {
        firstOther = candidate;
        firstOtherOffset = offset;
      }
    }

    // a non-XML detection takes precedence; null if we failed altogether
    return firstOther != null ? firstOther : firstXml;
  }

  /**
    * Returns the last "."-separated token of the URL's file part, as given by
    * {@link URL#getFile()}. When the file part contains no "." the whole file
    * part is returned.
    * @param url the URL to inspect; may be null
    * @return the last token, or null if the url is null or its file part
    * yields no token
    */
  @SuppressWarnings("unused")
  private static String getFileSuffix(URL url) {
    // GIGO test  (garbage in garbage out)
    if (url == null) {
      return null;
    }

    // walk over the "."-separated tokens of the file name, remembering the
    // most recent one; after the loop that is the last token
    String lastToken = null;
    for (StringTokenizer st = new StringTokenizer(url.getFile(), ".");
         st.hasMoreTokens();) {
      lastToken = st.nextToken();
    }
    return lastToken;
  } // getFileSuffix

  /**
   * Given a URL, this method returns all the 'file extensions' for the file
   * part of the URL's path. For this purpose, a 'file extension' is any
   * sequence of .-separated tokens that runs to the end of the file name.
   * The extensions are returned without a leading dot, ordered from the most
   * specific (longest) to the most generic (shortest) one; e.g. a file named
   * doc.gate.xml.gz yields [gate.xml.gz, xml.gz, gz].
   *
   * @param url the URL to inspect; may be null
   * @return the list of extensions, empty if the url is null or the file
   *         name has no extension
   */
  private static List<String> getFileSuffixes(URL url){
    List<String> res = new LinkedList<String>();
    if (url != null){
      // get the file name from the URL: everything from the last '/' on
      // (the '/' itself is kept, which is why the search starts at index 1)
      String fileName = url.getPath();
      int pos = fileName.lastIndexOf('/');
      if(pos  > 0) fileName = fileName.substring(pos);
      pos = fileName.indexOf('.', 1);
      // every '.' that is not the last character starts one more extension
      while(pos > 0 && pos < fileName.length() - 1) {
        res.add(fileName.substring(pos + 1));
        pos = fileName.indexOf('.', pos + 1);
      }
    }
    return res;
  }
  
  
  /**
    * Find a DocumentFormat implementation that deals with a particular
    * MIME type, given that type. When no type is given, it is guessed from
    * (at most) the first 2048 characters of the document's content.
    * @param  aGateDocument this document will receive as a feature
    *                      the associated Mime Type. The name of the feature is
    *                      MimeType and its value is in the format type/subtype
    * @param  mimeType the mime type that is given as input; may be null
    * @return the DocumentFormat registered for the type, or null if the type
    *         is neither given nor detectable, or has no registered format
    */
  public static DocumentFormat getDocumentFormat(gate.Document aGateDocument,
                                                 MimeType mimeType) {
    if (mimeType == null) {
      String content = aGateDocument.getContent().toString();
      // reduce size for better performance
      if (content.length() > 2048) {
        content = content.substring(0, 2048);
      }
      mimeType = getTypeFromContent(content);
    }

    if (mimeType == null) {
      return null;
    }

    // If the Gate Document doesn't have a feature map attached then
    // we create and set one.
    if (aGateDocument.getFeatures() == null) {
      aGateDocument.setFeatures(Factory.newFeatureMap());
    }
    FeatureMap docFeatures = aGateDocument.getFeatures();

    String mimeString = mimeType.getType() + "/" + mimeType.getSubtype();
    docFeatures.put("MimeType", mimeString);
    return mimeString2ClassHandlerMap.get(mimeString);
  } // getDocumentFormat(aGateDocument, MimeType)

  /**
    * Find a DocumentFormat implementation that deals with a particular
    * MIME type, given the file suffix that the document came from. The suffix
    * is resolved to a MimeType through the suffix registry of this class.
    * @param  aGateDocument this document will receive as a feature
    *                     the associated Mime Type. The name of the feature is
    *                     MimeType and its value is in the format type/subtype
    * @param  fileSuffix the file suffix that is given as input
    * @return the matching DocumentFormat, or null if none is found
    */
  public static DocumentFormat getDocumentFormat(gate.Document aGateDocument,
                                                 String fileSuffix) {
    MimeType typeOfSuffix = getMimeType(fileSuffix);
    return getDocumentFormat(aGateDocument, typeOfSuffix);
  } // getDocumentFormat(String)
  
  /**
   * Looks up the DocumentFormat implementation registered for the given
   * MIME type, using the key "type/subtype".
   *
   * @param mimeType the MIME type you want the DocumentFormat for
   * @return the DocumentFormat associated with the MIME type or null if
   *         the MIME type does not have a registered DocumentFormat
   */
  public static DocumentFormat getDocumentFormat(MimeType mimeType) {
    String mimeString = mimeType.getType() + "/" + mimeType.getSubtype();
    return mimeString2ClassHandlerMap.get(mimeString);
  }

  /**
    * Find a DocumentFormat implementation that deals with a particular
    * MIME type, given the URL of the Document. The MIME type is worked out
    * from the URL by combining what the web server reports, the file
    * extension and the magic numbers found in the content; the format is
    * then looked up using that type.
    * @param  aGateDocument this document will receive as a feature
    *                      the associated Mime Type. The name of the feature is
    *                      MimeType and its value is in the format type/subtype
    * @param  url  the URL that is given as input
    * @return the matching DocumentFormat, or null if none is found
    */
  public static DocumentFormat getDocumentFormat(gate.Document aGateDocument,
                                                 URL url) {
    MimeType typeOfUrl = getMimeType(url);
    return getDocumentFormat(aGateDocument, typeOfUrl);
  } // getDocumentFormat(URL)
  
  /** Returns the feature map of this resource (null until one is set). */
  @Override
  public FeatureMap getFeatures() {
    return this.features;
  }

   /** Returns the map of markup elements to annotation types; may be null. */
  public Map getMarkupElementsMap() {
    return this.markupElementsMap;
  }

   /** Returns the element-to-string map; may be null. */
  public Map getElement2StringMap() {
    return this.element2StringMap;
  }

  /** Replaces the map of markup elements to annotation types.
    * @param markupElementsMap the new map; stored as given, not copied
    */
  public void setMarkupElementsMap(Map markupElementsMap) {
    final Map replacement = markupElementsMap;
    this.markupElementsMap = replacement;
  }

  /** Replaces the element-to-string map.
    * @param anElement2StringMap the new map; stored as given, not copied
    */
  public void setElement2StringMap(Map anElement2StringMap) {
    this.element2StringMap = anElement2StringMap;
  }

  /** Replaces the feature map of this resource.
    * @param features the new feature map; stored as given, not copied
    */
  @Override
  public void setFeatures(FeatureMap features) {
    this.features = features;
  }

  /** Sets the MIME type of this format.
    * @param aMimeType the new MIME type
    */
  public void setMimeType(MimeType aMimeType) {
    this.mimeType = aMimeType;
  }
  /** Returns the MIME type of this format (null until one is set). */
  public MimeType getMimeType() {
    return this.mimeType;
  }


  /**
   * Utility method to get the {@link MimeType} registered under a type
   * string.
   *
   * @param typeString the MIME type string to look up
   * @return the registered MimeType, or null if the string is not registered
   */
  public static MimeType getMimeTypeForString(String typeString) {
    final MimeType registered = mimeString2mimeTypeMap.get(typeString);
    return registered;
  }

  /**
   * Utility method to get the set of all file suffixes that are registered
   * with this class.
   *
   * @return an unmodifiable view of the keys of the file suffix registry
   */
  public static Set<String> getSupportedFileSuffixes() {
    return Collections.unmodifiableSet(suffixes2mimeTypeMap.keySet());
  }

  //StatusReporter Implementation


  /**
   * Unregisters a status listener. The current listener vector is never
   * modified in place: a clone without the listener replaces it. Nothing
   * happens if the listener is not registered.
   *
   * @param l the listener to remove
   */
  public synchronized void removeStatusListener(StatusListener l) {
    if (statusListeners != null && statusListeners.contains(l)) {
      // clone() returns Object, hence the unchecked cast
      @SuppressWarnings("unchecked")
      Vector<StatusListener> v =
          (Vector<StatusListener>) statusListeners.clone();
      v.removeElement(l);
      statusListeners = v;
    }
  }
  /**
   * Registers a status listener. The current listener vector is never
   * modified in place: a clone with the listener appended replaces it.
   * Nothing happens if the listener is already registered.
   *
   * @param l the listener to add
   */
  public synchronized void addStatusListener(StatusListener l) {
    // clone() returns Object, hence the unchecked cast
    @SuppressWarnings("unchecked")
    Vector<StatusListener> v = statusListeners == null
        ? new Vector<StatusListener>(2)
        : (Vector<StatusListener>) statusListeners.clone();
    if (!v.contains(l)) {
      v.addElement(l);
      statusListeners = v;
    }
  }
  /**
   * Notifies every registered status listener, in registration order.
   *
   * @param e the status message passed to each listener's statusChanged()
   */
  protected void fireStatusChanged(String e) {
    // Read the field exactly once. addStatusListener/removeStatusListener
    // replace the vector rather than modifying it, so this snapshot keeps a
    // stable size even if listeners are added or removed while we iterate.
    Vector<StatusListener> listeners = statusListeners;
    if (listeners == null) {
      return;
    }

    int count = listeners.size();
    for (int i = 0; i < count; i++) {
      listeners.elementAt(i).statusChanged(e);
    }
  }

} // class DocumentFormat