gate.corpora.DocumentContentImpl Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of gate-core Show documentation
Show all versions of gate-core Show documentation
GATE - general architecture for text engineering - is open source
software capable of solving almost any text processing problem. This
artifact enables you to embed the core GATE Embedded with its essential
dependencies. You will be able to use the GATE Embedded API and load and
store GATE XML documents. This artifact is the perfect dependency for
CREOLE plugins or for applications that need to customize the GATE
dependencies due to conflicts with their own dependencies or for a lower
footprint.
The newest version!
/*
* DocumentContentImpl.java
*
* Copyright (c) 1995-2012, The University of Sheffield. See the file
* COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
*
* This file is part of GATE (see http://gate.ac.uk/), and is free
* software, licenced under the GNU Library General Public License,
* Version 2, June 1991 (in the distribution as file licence.html,
* and also available at http://gate.ac.uk/gate/licence.html).
*
* Hamish Cunningham, 11/Feb/2000
*
* $Id: DocumentContentImpl.java 19642 2016-10-06 09:52:06Z markagreenwood $
*/
package gate.corpora;
import gate.DocumentContent;
import gate.util.BomStrippingInputStreamReader;
import gate.util.InvalidOffsetException;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.zip.GZIPInputStream;
import org.apache.commons.io.IOUtils;
/**
 * Represents the commonalities between all sorts of document contents.
 *
 * The content is held as a single {@link String}; the offsets accepted by
 * the methods of this class are character offsets into that string.
 */
public class DocumentContentImpl implements DocumentContent
{
  /**
   * Buffer size for reading.
   * 16k is 4 times the block size on most filesystems
   * so it should be efficient for most cases.
   */
  private static final int INTERNAL_BUFFER_SIZE = 16*1024;

  /**
   * Default construction: the content is the empty string.
   * {@link #getOriginalContent()} returns null for such an instance because
   * the original content is never assigned here.
   */
  public DocumentContentImpl() {
    content = "";
  } // default construction

  /**
   * Construction from URL and offsets.
   *
   * Opens a connection to the URL, wraps the stream in a
   * {@link GZIPInputStream} when the connection reports a gzip content
   * encoding, and reads characters through a
   * {@link BomStrippingInputStreamReader}. The characters read become both
   * the content and the original content of this object.
   *
   * @param u the URL to read from
   * @param encoding the character encoding name; when null or empty the
   *        reader is created without an explicit encoding
   * @param start offset of the first character to keep; only honoured when
   *        {@code end} is non-null as well
   * @param end offset just past the last character to keep; only honoured
   *        when {@code start} is non-null as well
   * @throws IOException if the connection cannot be opened or read
   */
  public DocumentContentImpl(URL u, String encoding, Long start, Long end)
      throws IOException {
    // a range is applied only when both ends are supplied; otherwise the
    // whole stream is read
    long s = 0, e = Long.MAX_VALUE;
    if(start != null && end != null) {
      s = start.longValue();
      e = end.longValue();
    }

    int readLength = 0;
    char[] readBuffer = new char[INTERNAL_BUFFER_SIZE];
    // method-local accumulator, so the unsynchronised builder is sufficient
    StringBuilder buf = new StringBuilder();
    BufferedReader uReader = null;
    InputStream uStream = null;

    try {
      URLConnection conn = u.openConnection();
      uStream = conn.getInputStream();

      if(isGzipEncoding(conn.getContentEncoding())) {
        uStream = new GZIPInputStream(uStream);
      }

      if(encoding != null && !encoding.isEmpty()) {
        uReader = new BomStrippingInputStreamReader(uStream, encoding, INTERNAL_BUFFER_SIZE);
      } else {
        uReader = new BomStrippingInputStreamReader(uStream, INTERNAL_BUFFER_SIZE);
      }

      // 1. skip S characters
      uReader.skip(s);

      // 2. how many characters shall I read?
      long toRead = e - s;

      // 3. read from source into buffer
      while(
        toRead > 0 &&
        (readLength = uReader.read(readBuffer, 0, INTERNAL_BUFFER_SIZE)) != -1
      ) {
        if(toRead < readLength) {
          // toRead (long) is smaller than readLength (int), so it fits in
          // an int and the cast cannot overflow
          readLength = (int)toRead;
        }
        buf.append(readBuffer, 0, readLength);
        toRead -= readLength;
      }
    }
    finally {
      // 4. close reader and stream; both calls tolerate null
      IOUtils.closeQuietly(uReader);
      IOUtils.closeQuietly(uStream);
    }

    content = buf.toString();
    originalContent = content;
  } // Construction from URL and offsets

  /**
   * Tells whether a Content-Encoding value denotes gzip compression.
   * HTTP content-coding names are case-insensitive and "x-gzip" is an alias
   * of "gzip", so both are accepted in any letter case.
   *
   * @param contentEncoding the value reported by the connection, may be null
   * @return true if the value names the gzip coding
   */
  private static boolean isGzipEncoding(String contentEncoding) {
    return "gzip".equalsIgnoreCase(contentEncoding)
        || "x-gzip".equalsIgnoreCase(contentEncoding);
  }

  /**
   * Propagate changes to the document content: the characters between
   * {@code start} and {@code end} are replaced by the replacement content.
   * The original content is not modified.
   *
   * @param start offset of the first character to replace
   * @param end offset just past the last character to replace
   * @param replacement the new content for the range; null stands for the
   *        empty string, any other value must be a DocumentContentImpl
   */
  void edit(Long start, Long end, DocumentContent replacement)
  {
    int s = start.intValue(), e = end.intValue();
    String repl = replacement == null ? "" :
      ((DocumentContentImpl) replacement).content;
    StringBuilder newContent = new StringBuilder(content);
    newContent.replace(s, e, repl);
    content = newContent.toString();
  } // edit(start,end,replacement)

  /**
   * Returns the part of this content between two offsets as a new
   * content object.
   *
   * @param start offset of the first character to include
   * @param end offset just past the last character to include
   * @return a new DocumentContentImpl holding the requested substring
   * @throws InvalidOffsetException if either offset is null or outside
   *         the content, or if start is greater than end
   */
  @Override
  public DocumentContent getContent(Long start, Long end)
    throws InvalidOffsetException
  {
    if(! isValidOffsetRange(start, end))
      throw new InvalidOffsetException("Invalid offset range "+start+" to "+end+
              " for document content of length "+this.size());

    return new DocumentContentImpl(
      content.substring(start.intValue(), end.intValue())
    );
  } // getContent(start, end)

  /**
   * Returns the String representing the content in case of a textual document.
   * NOTE: this is a temporary solution until we have a more generic one.
   */
  @Override
  public String toString(){
    return content;
  }

  /**
   * The size of this content (e.g. character length for textual
   * content).
   */
  @Override
  public Long size() {
    return Long.valueOf(content.length());
  } // size()

  /**
   * Check that an offset is valid: non-null and between 0 and the content
   * length, both inclusive.
   */
  boolean isValidOffset(Long offset) {
    if(offset == null)
      return false;

    long o = offset.longValue();
    long len = content.length();
    if(o > len || o < 0)
      return false;

    return true;
  } // isValidOffset

  /**
   * Check that both start and end are valid offsets and that
   * they constitute a valid offset range (start not after end).
   */
  boolean isValidOffsetRange(Long start, Long end) {
    return
      isValidOffset(start) && isValidOffset(end) &&
      start.longValue() <= end.longValue();
  } // isValidOffsetRange(start,end)

  /**
   * Two document contents are the same if their content strings are equal.
   */
  @Override
  public boolean equals(Object other) {
    if (!(other instanceof DocumentContentImpl)) return false;
    DocumentContentImpl docImpl = (DocumentContentImpl) other;
    return content.equals(docImpl.toString());
  } // equals

  /** Calculate the hash value for the object from its string form. */
  @Override
  public int hashCode(){ return toString().hashCode(); }

  /**
   * Just for now - later we have to cater for different types of
   * content.
   */
  String content;

  /**
   * For preserving the original content of the document.
   * The edit command does not affect the original content.
   * It is assigned at construction time to the same string as the content:
   * for construction from a URL that is the text selected by the start and
   * end offsets, for construction from a string it is that string.
   */
  String originalContent;

  /**
   * Return the original content of the document received during the loading
   * phase or on construction from string.
   */
  public String getOriginalContent() { return originalContent; }

  /** For ranges: wraps the given string as both content and original content. */
  public DocumentContentImpl(String s)
    { content = s; originalContent = content; }

  /** Freeze the serialization UID. */
  static final long serialVersionUID = -1426940535575467461L;
} // class DocumentContentImpl