All Downloads are FREE. Search and download functionalities are using the official Maven repository.

gate.corpora.DocumentContentImpl Maven / Gradle / Ivy

Go to download

GATE - general achitecture for text engineering - is open source software capable of solving almost any text processing problem. This artifact enables you to embed the core GATE Embedded with its essential dependencies. You will able to use the GATE Embedded API and load and store GATE XML documents. This artifact is the perfect dependency for CREOLE plugins or for applications that need to customize the GATE dependencies due to confict with their own dependencies or for lower footprint.

The newest version!
/*
 *  DocumentContentImpl.java
 *
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Hamish Cunningham, 11/Feb/2000
 *
 *  $Id: DocumentContentImpl.java 19642 2016-10-06 09:52:06Z markagreenwood $
 */

package gate.corpora;

import gate.DocumentContent;
import gate.util.BomStrippingInputStreamReader;
import gate.util.InvalidOffsetException;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.zip.GZIPInputStream;

import org.apache.commons.io.IOUtils;

/** Represents the commonalities between all sorts of document contents.
  */
public class DocumentContentImpl implements DocumentContent
{
  /** Buffer size for reading
   *  16k is 4 times the block size on most filesystems
   *  so it should be efficient for most cases
   *  */
  private static final int INTERNAL_BUFFER_SIZE  = 16*1024;

  /** Default construction */
  public DocumentContentImpl() {
    content = "";
  } // default construction

  /** Contruction from URL and offsets. */
  public DocumentContentImpl(URL u, String encoding, Long start, Long end)
  throws IOException {

    int readLength = 0;
    char[] readBuffer = new char[INTERNAL_BUFFER_SIZE];

    BufferedReader uReader = null;
    InputStream uStream = null;
    StringBuffer buf = new StringBuffer();

    long s = 0, e = Long.MAX_VALUE;
    if(start != null && end != null) {
      s = start.longValue();
      e = end.longValue();
    }

    try {
      URLConnection conn = u.openConnection();
      uStream = conn.getInputStream();
            
      if ("gzip".equals(conn.getContentEncoding())) {
        uStream = new GZIPInputStream(uStream);
      }
      
      if(encoding != null && !encoding.equalsIgnoreCase("")) {
        uReader = new BomStrippingInputStreamReader(uStream, encoding, INTERNAL_BUFFER_SIZE);
      } else {
        uReader = new BomStrippingInputStreamReader(uStream, INTERNAL_BUFFER_SIZE);
      };
  
      // 1. skip S characters
      uReader.skip(s);
  
      // 2. how many character shall I read?
      long toRead = e - s;
  
      // 3. read gtom source into buffer
      while (
        toRead > 0 &&
        (readLength = uReader.read(readBuffer, 0, INTERNAL_BUFFER_SIZE)) != -1
      ) {
        if (toRead <  readLength) {
          //well, if toRead(long) is less than readLenght(int)
          //then there can be no overflow, so the cast is safe
          readLength = (int)toRead;
        }
  
        buf.append(readBuffer, 0, readLength);
        toRead -= readLength;
      }
    }
    finally {
      // 4.close reader
      IOUtils.closeQuietly(uReader);
      IOUtils.closeQuietly(uStream);
    }

    content = new String(buf);
    originalContent = content;
  } // Contruction from URL and offsets

  /** Propagate changes to the document content. */
  void edit(Long start, Long end, DocumentContent replacement)
  {
    int s = start.intValue(), e = end.intValue();
    String repl = replacement == null ? "" :
      ((DocumentContentImpl) replacement).content;
    StringBuffer newContent = new StringBuffer(content);
    newContent.replace(s, e, repl);
    content = newContent.toString();
  } // edit(start,end,replacement)

  @Override
  public DocumentContent getContent(Long start, Long end)
    throws InvalidOffsetException
  {
    if(! isValidOffsetRange(start, end))
      throw new InvalidOffsetException("Invalid offset range "+start+" to "+end+
              " for document content of length "+this.size());

    return new DocumentContentImpl(
      content.substring(start.intValue(), end.intValue())
    );
  } // getContent(start, end)

  /** Returns the String representing the content in case of a textual document.
    * NOTE: this is a temporary solution until we have a more generic one.
    */
  @Override
  public String toString(){
    return content;
  }

  /** The size of this content (e.g. character length for textual
    * content).
    */
  @Override
  public Long size() {
    return Long.valueOf(content.length());
  } // size()

  /** Check that an offset is valid */
  boolean isValidOffset(Long offset) {
    if(offset == null)
      return false;

    long o = offset.longValue();
    long len = content.length();
    if(o > len || o < 0)
      return false;

    return true;
  } // isValidOffset

  /** Check that both start and end are valid offsets and that
    * they constitute a valid offset range
    */
  boolean isValidOffsetRange(Long start, Long end) {
    return
      isValidOffset(start) && isValidOffset(end) &&
      start.longValue() <= end.longValue();
  } // isValidOffsetRange(start,end)

  /** Two documents are the same if their contents is the same
   */
  @Override
  public boolean equals(Object other) {
    if (!(other instanceof DocumentContentImpl)) return false;

    DocumentContentImpl docImpl = (DocumentContentImpl) other;
    return content.equals(docImpl.toString());
  } // equals

  /** Calculate the hash value for the object. */
  @Override
  public int hashCode(){ return toString().hashCode(); }

  /** Just for now - later we have to cater for different types of
    * content.
    */
  String content;

  /**
   * For preserving the original content of the document.
   * The edit command didn't affect on the original content.
   * If you construct the content by URL the originalContent will keep
   * whole information retrieved by URL even you set some start and end.
   */
  String originalContent;

  /**
   * Return the original content of the document received during the loading
   * phase or on construction from string.
   */
  public String getOriginalContent() { return originalContent; }

  /** For ranges */
  public DocumentContentImpl(String s)
    { content = s; originalContent = content; }

  /** Freeze the serialization UID. */
  static final long serialVersionUID = -1426940535575467461L;
} // class DocumentContentImpl




© 2015 - 2024 Weber Informatics LLC | Privacy Policy