All downloads are free. Search and download functionality uses the official Maven repository.

gate.DocumentFormat Maven / Gradle / Ivy

/*
 *  DocumentFormat.java
 *
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Hamish Cunningham, 25/May/2000
 *
 *  $Id: DocumentFormat.java 19756 2016-11-19 01:55:44Z markagreenwood $
 */

package gate;

import gate.corpora.MimeType;
import gate.corpora.RepositioningInfo;
import gate.creole.AbstractLanguageResource;
import gate.event.StatusListener;
import gate.util.BomStrippingInputStreamReader;
import gate.util.DocumentFormatException;
import gate.util.GateException;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.file.Files;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.Vector;

import org.apache.commons.io.IOUtils;

/** The format of Documents. Subclasses of DocumentFormat know about
  * particular MIME types and how to unpack the information in any
  * markup or formatting they contain into GATE annotations. Each MIME
  * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat,
  * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves
  * with a static index residing here when they are constructed. Static
  * getDocumentFormat methods can then be used to get the appropriate
  * format class for a particular document.
  */
public abstract class DocumentFormat
extends AbstractLanguageResource {
  
  private static final long serialVersionUID = 4147880563349143923L;

  /** The MIME type of this format. */
  private MimeType mimeType = null;

  /** Map of MimeTypeString to ClassHandler class. This is used to find the
    * language resource that deals with the specific Document format
    */
  protected static final Map
          mimeString2ClassHandlerMap = new HashMap();
  /** Map of MimeType to DocumentFormat Class. This is used to find the
    * DocumentFormat subclass that deals with a particular MIME type.
    */
  protected static final Map
          mimeString2mimeTypeMap = new HashMap();

  /** Map of Set of file suffixes to MimeType. This is used to figure
    * out what MIME type a document is from its file name.
    */
  protected static final Map
          suffixes2mimeTypeMap = new HashMap();

  /** Map of Set of magic numbers to MimeType. This is used to guess the
    * MIME type of a document, when we don't have any other clues.
    */
  protected static final Map
          magic2mimeTypeMap = new HashMap();

  /** Map of markup elements to annotation types. If it is null, the
    * unpackMarkup() method will convert all markup, using the element names
    * for annotation types. If it is non-null, only those elements specified
    * here will be converted.
    */
  protected Map markupElementsMap = null;

  /** This map is used inside uppackMarkup() method...
    * When an element from the map is encounted, The corresponding string
    * element is added to the document content
    */
  protected Map element2StringMap = null;

  /** The features of this resource */
  private FeatureMap features = null;

  /** Default construction */
  public DocumentFormat() {}

  /** listeners for status report */
  private transient Vector statusListeners;

  /** Flag for enable/disable collecting of repositioning information */
  private Boolean shouldCollectRepositioning = Boolean.FALSE;

  /** If the document format could collect repositioning information
   *  during the unpack phase this method will return true.
   *  
* You should override this method in the child class of the defined * document format if it could collect the repositioning information. */ public Boolean supportsRepositioning() { return Boolean.FALSE; } // supportsRepositioning public void setShouldCollectRepositioning(Boolean b) { if(supportsRepositioning().booleanValue() && b.booleanValue()) { shouldCollectRepositioning = b; } else { shouldCollectRepositioning = Boolean.FALSE; } // if } // setShouldCollectRepositioning public Boolean getShouldCollectRepositioning() { return shouldCollectRepositioning; } // /** Unpack the markup in the document. This converts markup from the * native format (e.g. XML, RTF) into annotations in GATE format. * Uses the markupElementsMap to determine which elements to convert, and * what annotation type names to use. */ abstract public void unpackMarkup(Document doc) throws DocumentFormatException; abstract public void unpackMarkup(Document doc, RepositioningInfo repInfo, RepositioningInfo ampCodingInfo) throws DocumentFormatException; /** Unpack the markup in the document. This method calls unpackMarkup on the * GATE document, but after it saves its content as a feature attached to * the document. This method is useful if one wants to save the content * of the document being unpacked. After the markups have been unpacked, * the content of the document will be replaced with a new one containing * the text between markups. * * @param doc the document that will be unpacked * @param originalContentFeatureType the name of the feature that will hold * the document's content. */ public void unpackMarkup( Document doc, String originalContentFeatureType ) throws DocumentFormatException{ FeatureMap fm = doc.getFeatures(); if (fm == null) fm = Factory.newFeatureMap(); fm.put(originalContentFeatureType, doc.getContent().toString()); doc.setFeatures(fm); unpackMarkup(doc); }// unpackMarkup(); /** * Returns a MimeType having as input a fileSufix. 
* If the file sufix is null or not recognised then, * null will be returned. * @param fileSufix The file sufix associated with a recognisabe mime type. * @return The MimeType associated with this file suffix. */ static private MimeType getMimeType(String fileSufix){ // Get a mimeType string associated with this fileSuffix // Eg: for html returns MimeType("text/html"), for xml returns // MimeType("text/xml") if(fileSufix == null) return null; return suffixes2mimeTypeMap.get(fileSufix.toLowerCase()); }//getMimeType public static Set getSupportedMimeTypes() { return Collections.unmodifiableSet(mimeString2mimeTypeMap.keySet()); } /** * Returns a MymeType having as input a URL object. If the MimeType wasn't * recognized it returns null. * @param url The URL object from which the MimeType will be extracted * @return A MimeType object for that URL, or null if the Mime Type is * unknown. */ static private MimeType getMimeType(URL url) { String mimeTypeString = null; String charsetFromWebServer = null; String contentType = null; InputStream is = null; MimeType mimeTypeFromWebServer = null; MimeType mimeTypeFromFileSuffix = null; MimeType mimeTypeFromMagicNumbers = null; if (url == null) return null; // Ask the web server for the content type // We expect to get contentType something like this: // "text/html; charset=iso-8859-1" // Charset is optional try { try{ URLConnection urlconn = url.openConnection(); is = urlconn.getInputStream(); contentType = urlconn.getContentType(); } catch (IOException e){ // Failed to get the content type with te Web server. // Let's try some other methods like FileSuffix or magic numbers. } // If a content Type was returned by the server, try to get the mime Type // string // If contentType is something like this:"text/html; charset=iso-8859-1" // try to get content Type string (text/html) if (contentType != null){ StringTokenizer st = new StringTokenizer(contentType, ";"); // We assume that the first token is the mime type string... 
// If this doesn't happen then BAD LUCK :(( ... if (st.hasMoreTokens()) mimeTypeString = st.nextToken().toLowerCase(); // The next token it should be the CharSet if (st.hasMoreTokens()) charsetFromWebServer = st.nextToken().toLowerCase(); if (charsetFromWebServer != null){ //We have something like : "charset=iso-8859-1" and let's extract the // encoding. st = new StringTokenizer(charsetFromWebServer, "="); // Don't need this anymore charsetFromWebServer = null; // Discarding the first token which is : "charset" if (st.hasMoreTokens()) st.nextToken(); // Get the encoding : "ISO-8859-1" if (st.hasMoreTokens()) charsetFromWebServer = st.nextToken().toUpperCase(); } // End if }// end if // Return the corresponding MimeType with WebServer from the associated MAP mimeTypeFromWebServer = mimeString2mimeTypeMap.get(mimeTypeString); // Let's try a file suffix detection // mimeTypeFromFileSuffix = getMimeType(getFileSuffix(url)); for(String suffix : getFileSuffixes(url)) { mimeTypeFromFileSuffix = getMimeType(suffix); if(mimeTypeFromFileSuffix != null) break; } // Let's perform a magic numbers guess.. mimeTypeFromMagicNumbers = guessTypeUsingMagicNumbers(is, charsetFromWebServer); } finally { IOUtils.closeQuietly(is); //null safe } //All those types enter into a deciding system return decideBetweenThreeMimeTypes( mimeTypeFromWebServer, mimeTypeFromFileSuffix, mimeTypeFromMagicNumbers); }//getMimeType /** * This method decides what mimeType is in majority * @param aMimeTypeFromWebServer a MimeType * @param aMimeTypeFromFileSuffix a MimeType * @param aMimeTypeFromMagicNumbers a MimeType * @return the MimeType which occurs most. 
If all are null, then returns * null */ protected static MimeType decideBetweenThreeMimeTypes( MimeType aMimeTypeFromWebServer, MimeType aMimeTypeFromFileSuffix, MimeType aMimeTypeFromMagicNumbers){ // First a voting system if (areEqual(aMimeTypeFromWebServer,aMimeTypeFromFileSuffix)) return aMimeTypeFromFileSuffix; if (areEqual(aMimeTypeFromFileSuffix,aMimeTypeFromMagicNumbers)) return aMimeTypeFromFileSuffix; if (areEqual(aMimeTypeFromWebServer,aMimeTypeFromMagicNumbers)) return aMimeTypeFromWebServer; // 1 is the highest priority if (aMimeTypeFromFileSuffix != null) aMimeTypeFromFileSuffix.addParameter("Priority","1"); // 2 is the second priority if (aMimeTypeFromWebServer != null) aMimeTypeFromWebServer.addParameter("Priority","2"); // 3 is the third priority if (aMimeTypeFromMagicNumbers != null) aMimeTypeFromMagicNumbers.addParameter("Priority","3"); return decideBetweenTwoMimeTypes( decideBetweenTwoMimeTypes(aMimeTypeFromWebServer, aMimeTypeFromFileSuffix), aMimeTypeFromMagicNumbers); }// decideBetweenThreeMimeTypes /** Decide between two mimeTypes. The decistion is made on "Priority" * parameter set into decideBetweenThreeMimeTypes method. If both mimeTypes * doesn't have "Priority" paramether set, it will return one on them. * @param aMimeType a MimeType object with "Prority" parameter set * @param anotherMimeType a MimeType object with "Prority" parameter set * @return One of the two mime types. 
*/ protected static MimeType decideBetweenTwoMimeTypes( MimeType aMimeType, MimeType anotherMimeType){ if (aMimeType == null) return anotherMimeType; if (anotherMimeType == null) return aMimeType; int priority1 = 0; int priority2 = 0; // Both of them are not null if (aMimeType.hasParameter("Priority")) try{ priority1 = Integer.parseInt(aMimeType.getParameterValue("Priority")); }catch (NumberFormatException e){ return anotherMimeType; } if (anotherMimeType.hasParameter("Priority")) try{ priority2 = Integer.parseInt(anotherMimeType.getParameterValue("Priority")); }catch (NumberFormatException e){ return aMimeType; } // The lower the number, the highest the priority if (priority1 <= priority2) return aMimeType; else return anotherMimeType; }// decideBetweenTwoMimeTypes /** * Tests if two MimeType objects are equal. * @return true only if boths MimeType objects are different than null * and their Types and Subtypes are equals. The method is case sensitive. */ protected static boolean areEqual( MimeType aMimeType, MimeType anotherMimeType){ if (aMimeType == null || anotherMimeType == null) return false; if ( aMimeType.getType().equals(anotherMimeType.getType()) && aMimeType.getSubtype().equals(anotherMimeType.getSubtype()) ) return true; else return false; }// are Equal /** * This method tries to guess the mime Type using some magic numbers. * @param aInputStream a InputStream which has to be transformed into a * InputStreamReader * @param anEncoding the encoding. If is null or unknown then a * InputStreamReader with default encodings will be created. 
* @return the mime type associated with magic numbers */ protected static MimeType guessTypeUsingMagicNumbers(InputStream aInputStream, String anEncoding){ if (aInputStream == null) return null; Reader reader = null; if (anEncoding != null) try{ reader = new BomStrippingInputStreamReader(aInputStream, anEncoding); } catch (UnsupportedEncodingException e){ reader = null; } if (reader == null) // Create a reader with the default encoding system reader = new BomStrippingInputStreamReader(aInputStream); // We have a input stream reader return runMagicNumbers(reader); }//guessTypeUsingMagicNumbers /** Performs magic over Gate Document */ protected static MimeType runMagicNumbers(Reader aReader) { // No reader, nothing to detect if( aReader == null) return null; // Prepare to run the magic stuff String strBuffer = null; int bufferSize = 2048; int charReads = 0; char[] cbuf = new char[bufferSize]; try { charReads = aReader.read(cbuf,0,bufferSize); } catch (IOException e){ return null; }// End try if (charReads == -1) // the document is empty return null; // Create a string form the buffer and perform some search on it. 
strBuffer = new String(cbuf,0,charReads); // If this fails then surrender return getTypeFromContent(strBuffer); }// runMagicNumbers private static MimeType getTypeFromContent(String aContent){ // change case to cover more variants aContent = aContent.toLowerCase(); // the mime type we have detected (null to start with) MimeType detectedMimeType = null; // the offset of the first match now we use a "first wins" priority int firstOffset = Integer.MAX_VALUE; MimeType xmlMime = getMimeType("xml"); // Run the magic numbers test for(Map.Entry kv : magic2mimeTypeMap.entrySet()) { // the magic code we are looking for String magic = kv.getKey().toLowerCase(); // the offset of this code in the content int offset = aContent.indexOf(magic.toLowerCase()); if(offset != -1 && (offset < firstOffset || (!kv.getValue().equals(xmlMime) && detectedMimeType.equals(xmlMime)))) { // if the magic code exists in the doc and appears before any others // than use that mime type detectedMimeType = kv.getValue(); firstOffset = offset; } } // return the mime type (null if we failed) return detectedMimeType; } /** * Return the fileSuffix or null if the url doesn't have a file suffix * If the url is null then the file suffix will be null also */ @SuppressWarnings("unused") private static String getFileSuffix(URL url){ String fileName = null; String fileSuffix = null; // GIGO test (garbage in garbage out) if (url != null){ // get the file name from the URL fileName = url.getFile(); // tokenize this file name with "." as separator... // the last token will be the file suffix StringTokenizer st = new StringTokenizer(fileName,"."); // fileSuffix is the last token while (st.hasMoreTokens()) fileSuffix = st.nextToken(); // here fileSuffix is the last token } // End if return fileSuffix; }//getFileSufix /** * Given a URL, this method returns all the 'file extensions' for the file * part of the URL. For this purposes, a 'file extension' is any sequence of * .-separated tokens (such as .gate.xml.gz). 
The order the extensions are * returned in is from the most specific (longest) to the most generic * (shortest) one, e.g. [.gate.xml.gz, .xml.gz, .gz]. */ private static List getFileSuffixes(URL url){ List res = new LinkedList(); if (url != null){ // get the file name from the URL String fileName = url.getPath(); int pos = fileName.lastIndexOf('/'); if(pos > 0) fileName = fileName.substring(pos); pos = fileName.indexOf('.', 1); while(pos > 0 && pos < fileName.length() - 1) { res.add(fileName.substring(pos + 1)); pos = fileName.indexOf('.', pos + 1); } } return res; } /** * Find a DocumentFormat implementation that deals with a particular * MIME type, given that type. * @param aGateDocument this document will receive as a feature * the associated Mime Type. The name of the feature is * MimeType and its value is in the format type/subtype * @param mimeType the mime type that is given as input */ static public DocumentFormat getDocumentFormat(gate.Document aGateDocument, MimeType mimeType){ FeatureMap aFeatureMap = null; if(mimeType == null) { String content = aGateDocument.getContent().toString(); // reduce size for better performance if(content.length() > 2048) content = content.substring(0, 2048); mimeType = getTypeFromContent( content ); } if (mimeType != null){ // If the Gate Document doesn't have a feature map atached then // We will create and set one. if(aGateDocument.getFeatures() == null){ aFeatureMap = Factory.newFeatureMap(); aGateDocument.setFeatures(aFeatureMap); }// end if aGateDocument.getFeatures().put("MimeType",mimeType.getType() + "/" + mimeType.getSubtype()); return mimeString2ClassHandlerMap.get(mimeType.getType() + "/" + mimeType.getSubtype()); }// end If return null; } // getDocumentFormat(aGateDocument, MimeType) /** * Find a DocumentFormat implementation that deals with a particular * MIME type, given the file suffix (e.g. ".txt") that the document came * from. 
* @param aGateDocument this document will receive as a feature * the associated Mime Type. The name of the feature is * MimeType and its value is in the format type/subtype * @param fileSuffix the file suffix that is given as input */ static public DocumentFormat getDocumentFormat(gate.Document aGateDocument, String fileSuffix) { return getDocumentFormat(aGateDocument, getMimeType(fileSuffix)); } // getDocumentFormat(String) /** * Find the DocumentFormat implementation that deals with the given * MIME type. * * @param mimeType the MIME type you want the DocumentFormat for * @return the DocumentFormat associated with the MIME type or null if * the MIME type does not have a registered DocumentFormat */ public static DocumentFormat getDocumentFormat(MimeType mimeType) { return mimeString2ClassHandlerMap.get(mimeType.getType() + "/" + mimeType.getSubtype()); } /** * Find a DocumentFormat implementation that deals with a particular * MIME type, given the URL of the Document. If it is an HTTP URL, we * can ask the web server. If it has a recognised file extension, we * can use that. Otherwise we need to use a map of magic numbers * to MIME types to guess the type, and then look up the format using the * type. * @param aGateDocument this document will receive as a feature * the associated Mime Type. 
The name of the feature is * MimeType and its value is in the format type/subtype * @param url the URL that is given as input */ static public DocumentFormat getDocumentFormat(gate.Document aGateDocument, URL url) { return getDocumentFormat(aGateDocument, getMimeType(url)); } // getDocumentFormat(URL) /** Get the feature set */ @Override public FeatureMap getFeatures() { return features; } /** Get the markup elements map */ public Map getMarkupElementsMap() { return markupElementsMap; } /** Get the element 2 string map */ public Map getElement2StringMap() { return element2StringMap; } /** Set the markup elements map */ public void setMarkupElementsMap(Map markupElementsMap) { this.markupElementsMap = markupElementsMap; } /** Set the element 2 string map */ public void setElement2StringMap(Map anElement2StringMap) { element2StringMap = anElement2StringMap; } /** Set the features map*/ @Override public void setFeatures(FeatureMap features){this.features = features;} /** Set the mime type*/ public void setMimeType(MimeType aMimeType){mimeType = aMimeType;} /** Gets the mime Type*/ public MimeType getMimeType(){return mimeType;} /** * Utility method to get a {@link MimeType} given the type string. */ public static MimeType getMimeTypeForString(String typeString) { return mimeString2mimeTypeMap.get(typeString); } /** * Utility method to get the set of all file suffixes that are registered * with this class. */ public static Set getSupportedFileSuffixes() { return Collections.unmodifiableSet(suffixes2mimeTypeMap.keySet()); } //StatusReporter Implementation public synchronized void removeStatusListener(StatusListener l) { if (statusListeners != null && statusListeners.contains(l)) { @SuppressWarnings("unchecked") Vector v = (Vector) statusListeners.clone(); v.removeElement(l); statusListeners = v; } } public synchronized void addStatusListener(StatusListener l) { @SuppressWarnings("unchecked") Vector v = statusListeners == null ? 
new Vector(2) : (Vector) statusListeners.clone(); if (!v.contains(l)) { v.addElement(l); statusListeners = v; } } protected void fireStatusChanged(String e) { if (statusListeners != null) { int count = statusListeners.size(); for (int i = 0; i < count; i++) { statusListeners.elementAt(i).statusChanged(e); } } } } // class DocumentFormat




© 2015 - 2024 Weber Informatics LLC | Privacy Policy