// Note: many resources are needed to download a project. Please understand that we have to cover our server costs. Thank you in advance. Project price: only $1.
// You can buy this project and download or modify it as often as you want.
/*
* DocumentContentImpl.java
*
* Copyright (c) 1995-2012, The University of Sheffield. See the file
* COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
*
* This file is part of GATE (see http://gate.ac.uk/), and is free
* software, licenced under the GNU Library General Public License,
* Version 2, June 1991 (in the distribution as file licence.html,
* and also available at http://gate.ac.uk/gate/licence.html).
*
* Hamish Cunningham, 11/Feb/2000
*
* $Id: DocumentContentImpl.java 19642 2016-10-06 09:52:06Z markagreenwood $
*/
package gate.corpora;
import gate.DocumentContent;
import gate.util.BomStrippingInputStreamReader;
import gate.util.InvalidOffsetException;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.zip.GZIPInputStream;
import org.apache.commons.io.IOUtils;
/**
 * Represents the commonalities between all sorts of document contents.
 *
 * <p>The content is held as a single {@link String}. Offsets used throughout
 * this class are character offsets into that string.
 */
public class DocumentContentImpl implements DocumentContent
{
  /** Freeze the serialization UID. */
  static final long serialVersionUID = -1426940535575467461L;

  /**
   * Buffer size (in characters) for reading.
   * 16k is 4 times the block size on most filesystems
   * so it should be efficient for most cases.
   */
  private static final int INTERNAL_BUFFER_SIZE = 16 * 1024;

  /**
   * The current content. Just for now - later we have to cater for
   * different types of content.
   */
  String content;

  /**
   * A copy of the content as it was at construction time.
   * {@link #edit(Long, Long, DocumentContent)} changes {@link #content} only
   * and never touches this field. When the content is constructed from a URL
   * with a start and end offset, this holds the same range-limited text as
   * {@link #content}. It is left {@code null} by the default constructor.
   */
  String originalContent;

  /**
   * Default construction: empty content. The original content is not set
   * and therefore remains {@code null}.
   */
  public DocumentContentImpl() {
    content = "";
  } // default construction

  /**
   * Construction from a string (used for ranges). The string becomes both
   * the current and the original content.
   *
   * @param s the text of the content
   */
  public DocumentContentImpl(String s) {
    content = s;
    originalContent = content;
  } // construction from string

  /**
   * Construction from URL and offsets.
   *
   * <p>The range is only applied when <em>both</em> {@code start} and
   * {@code end} are non-null; otherwise the whole stream is read.
   *
   * @param u the URL to read the content from
   * @param encoding the character encoding to use; when {@code null} or
   *        empty the reader is created without an explicit encoding
   * @param start number of characters to skip before reading (may be
   *        {@code null})
   * @param end character offset at which to stop reading (may be
   *        {@code null})
   * @throws IOException if opening the connection or reading from it fails
   */
  public DocumentContentImpl(URL u, String encoding, Long start, Long end)
          throws IOException {
    long s = 0, e = Long.MAX_VALUE;
    if(start != null && end != null) {
      s = start.longValue();
      e = end.longValue();
    }

    // local accumulator, never shared between threads, so no need for the
    // synchronized StringBuffer
    StringBuilder buf = new StringBuilder();
    char[] readBuffer = new char[INTERNAL_BUFFER_SIZE];
    BufferedReader uReader = null;
    InputStream uStream = null;

    try {
      URLConnection conn = u.openConnection();
      uStream = conn.getInputStream();

      // HTTP content-coding names are case-insensitive, so "GZIP" must be
      // honoured as well; equalsIgnoreCase also copes with a null header
      if("gzip".equalsIgnoreCase(conn.getContentEncoding())) {
        uStream = new GZIPInputStream(uStream);
      }

      if(encoding != null && !encoding.equalsIgnoreCase("")) {
        uReader = new BomStrippingInputStreamReader(uStream, encoding,
                INTERNAL_BUFFER_SIZE);
      } else {
        uReader = new BomStrippingInputStreamReader(uStream,
                INTERNAL_BUFFER_SIZE);
      }

      // 1. skip S characters
      uReader.skip(s);

      // 2. how many characters shall I read?
      long toRead = e - s;

      // 3. read from source into buffer, never asking for more than is
      //    still needed
      while(toRead > 0) {
        // the minimum is at most INTERNAL_BUFFER_SIZE, so the cast is safe
        int chunk = (int)Math.min(toRead, (long)INTERNAL_BUFFER_SIZE);
        int readLength = uReader.read(readBuffer, 0, chunk);
        if(readLength == -1) break; // end of stream
        buf.append(readBuffer, 0, readLength);
        toRead -= readLength;
      }
    }
    finally {
      // 4. close reader and underlying stream
      IOUtils.closeQuietly(uReader);
      IOUtils.closeQuietly(uStream);
    }

    content = buf.toString();
    originalContent = content;
  } // Construction from URL and offsets

  /**
   * Propagate changes to the document content: the characters between
   * {@code start} and {@code end} are replaced by the replacement text.
   * Only the current content changes; the original content is untouched.
   *
   * <p>The offsets are not validated here. The replacement, when non-null,
   * is cast to {@link DocumentContentImpl}.
   *
   * @param start offset of the first character to replace
   * @param end offset just past the last character to replace
   * @param replacement the new content for the range, or {@code null} to
   *        delete the range
   */
  void edit(Long start, Long end, DocumentContent replacement)
  {
    int s = start.intValue(), e = end.intValue();
    String repl = replacement == null ? "" :
      ((DocumentContentImpl) replacement).content;
    StringBuilder newContent = new StringBuilder(content);
    newContent.replace(s, e, repl);
    content = newContent.toString();
  } // edit(start,end,replacement)

  /**
   * Returns the part of the content between two offsets as a new content
   * object.
   *
   * @param start offset of the first character to include
   * @param end offset just past the last character to include
   * @return a new {@link DocumentContentImpl} holding the requested substring
   * @throws InvalidOffsetException if the offsets do not form a valid range
   *         for this content
   */
  @Override
  public DocumentContent getContent(Long start, Long end)
    throws InvalidOffsetException
  {
    if(! isValidOffsetRange(start, end))
      throw new InvalidOffsetException("Invalid offset range "+start+" to "+end+
              " for document content of length "+this.size());

    return new DocumentContentImpl(
      content.substring(start.intValue(), end.intValue())
    );
  } // getContent(start, end)

  /**
   * Returns the String representing the content in case of a textual
   * document.
   * NOTE: this is a temporary solution until we have a more generic one.
   */
  @Override
  public String toString(){
    return content;
  }

  /**
   * The size of this content (e.g. character length for textual
   * content).
   */
  @Override
  public Long size() {
    return Long.valueOf(content.length());
  } // size()

  /**
   * Check that an offset is valid, i.e. non-null and between 0 and the
   * content length (both inclusive).
   */
  boolean isValidOffset(Long offset) {
    if(offset == null)
      return false;

    long o = offset.longValue();
    return o >= 0 && o <= content.length();
  } // isValidOffset

  /**
   * Check that both start and end are valid offsets and that
   * they constitute a valid offset range (start not after end).
   */
  boolean isValidOffsetRange(Long start, Long end) {
    return
      isValidOffset(start) && isValidOffset(end) &&
      start.longValue() <= end.longValue();
  } // isValidOffsetRange(start,end)

  /**
   * Two document contents are the same if their textual content is the
   * same. The original content is not taken into account.
   */
  @Override
  public boolean equals(Object other) {
    if (!(other instanceof DocumentContentImpl)) return false;

    DocumentContentImpl docImpl = (DocumentContentImpl) other;
    return content.equals(docImpl.toString());
  } // equals

  /** Calculate the hash value for the object from its textual content. */
  @Override
  public int hashCode(){ return toString().hashCode(); }

  /**
   * Return the original content of the document received during the loading
   * phase or on construction from string. This is {@code null} when the
   * object was created with the default constructor.
   */
  public String getOriginalContent() { return originalContent; }

} // class DocumentContentImpl