
// com.digitalpebble.behemoth.BehemothDocument - Maven / Gradle / Ivy
// The newest version!
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.digitalpebble.behemoth;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.VersionMismatchException;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
import com.digitalpebble.behemoth.util.MimeUtil;
/**
 * Implementation of a Document using Hadoop primitives. A BehemothDocument
 * consists of a URL, a content type, binary content, an optional text
 * representation, metadata and a list of {@link Annotation}s.
 *
 * <p>
 * The serialized form produced by {@link #write(DataOutput)} and consumed by
 * {@link #readFields(DataInput)} is, in order: a version byte, the URL, the
 * content length followed by the content bytes, the content type, a flag
 * followed by the text (if any), a flag followed by the metadata (if any), a
 * pool of annotation type / feature names, and finally the annotations, which
 * refer to their type and feature names by position in that pool.
 **/
public class BehemothDocument implements Writable {

    /**
     * Version byte written at the start of every serialized document.
     * {@link #readFields(DataInput)} rejects any version greater than this.
     **/
    private final static byte CUR_VERSION = 1;

    /** URL of the document **/
    private String url;

    /** Content type, as returned by {@code MimeUtil.cleanMimeType} **/
    private String contentType;

    /**
     * Text representation of a document - can be null if the document is at a
     * binary format and has not yet been converted; or if the document had
     * multimedia content
     **/
    private String text;

    /** Binary content from which the text can be extracted **/
    private byte[] content;

    /** Document metadata **/
    private MapWritable metadata;

    /** List holding the annotations **/
    private List<Annotation> annotations;

    /** Creates an empty document; all fields are initially null. **/
    public BehemothDocument() {
    }

    /** Returns the text of the document if it has been set or null **/
    public String getText() {
        return text;
    }

    /** Sets the text representation for this document **/
    public void setText(String text) {
        this.text = text;
    }

    /** Returns the binary content of the document if it has been set or null **/
    public byte[] getContent() {
        return content;
    }

    /** Sets the binary content for this document **/
    public void setContent(byte[] content) {
        this.content = content;
    }

    /** Returns the metadata or null if it has not been set **/
    public MapWritable getMetadata() {
        return metadata;
    }

    /**
     * Returns the metadata. If it has not been set and {@code create} is true,
     * a new empty MapWritable is created, stored in the document and
     * returned; otherwise null is returned.
     *
     * @param create
     *            whether to create the metadata when it is missing
     **/
    public MapWritable getMetadata(boolean create) {
        if (metadata == null && create) {
            metadata = new MapWritable();
        }
        return getMetadata();
    }

    /** Sets the metadata for this document **/
    public void setMetadata(MapWritable metadata) {
        this.metadata = metadata;
    }

    /**
     * Returns the list of Annotations. If none has been set, a new empty list
     * is created, stored in the document and returned.
     **/
    public List<Annotation> getAnnotations() {
        if (annotations == null) {
            annotations = new ArrayList<Annotation>();
        }
        return annotations;
    }

    /** Sets the annotations for this document **/
    public void setAnnotations(List<Annotation> annotations) {
        this.annotations = annotations;
    }

    /** Returns the URL for this document or null **/
    public String getUrl() {
        return url;
    }

    /** Sets the URL for this document **/
    public void setUrl(String url) {
        this.url = url;
    }

    /** Returns the content type for this document or null **/
    public String getContentType() {
        return contentType;
    }

    /** Sets the content type for this document **/
    public void setContentType(String contentType) {
        // make sure that the mime type does not contain any
        // charset info
        this.contentType = MimeUtil.cleanMimeType(contentType);
    }

    /**
     * Populates this document from its serialized form, replacing every
     * field. After this call the content is never null (an empty array is
     * used when the stored length is 0) and the annotation list is never
     * null.
     *
     * @param in
     *            the input to read from
     * @throws VersionMismatchException
     *             if the stored version is greater than the current one
     * @throws IOException
     *             if the stored content length is negative or the input
     *             cannot be read
     **/
    public final void readFields(DataInput in) throws IOException {
        byte version = in.readByte(); // read version
        if (version > CUR_VERSION) { // check version
            throw new VersionMismatchException(CUR_VERSION, version);
        }
        url = Text.readString(in);

        int contentLength = in.readInt();
        if (contentLength < 0) {
            // corrupt input: fail with an IOException instead of letting the
            // array allocation below throw NegativeArraySizeException
            throw new IOException("Invalid negative content length: "
                    + contentLength);
        }
        content = new byte[contentLength];
        if (contentLength > 0) {
            in.readFully(content);
        }
        contentType = Text.readString(in);

        boolean hasText = in.readBoolean();
        text = hasText ? Text.readString(in) : null;

        boolean hasMD = in.readBoolean();
        if (hasMD) {
            metadata = new MapWritable();
            metadata.readFields(in);
        } else {
            metadata = null;
        }

        // read the pool of annotation type and feature names; it stays null
        // when the pool is empty, in which case no annotation is expected
        int numTypes = in.readInt();
        List<String> types = null;
        if (numTypes > 0) {
            types = new ArrayList<String>(numTypes);
            for (int i = 0; i < numTypes; i++) {
                types.add(Text.readString(in));
            }
        }

        int numAnnots = in.readInt();
        this.annotations = new ArrayList<Annotation>(numAnnots);
        for (int i = 0; i < numAnnots; i++) {
            Annotation annot = new Annotation();
            readAnnotationFields(annot, in, types);
            this.annotations.add(annot);
        }
    }

    /** Serialization of a BehemothDocument **/
    public void write(DataOutput out) throws IOException {
        writeCommon(out);
        writeAnnotations(out); // write annotations
    }

    /**
     * Writes everything but the annotations: version, URL, content, content
     * type, text and metadata. Null content is written as a length of 0 and a
     * null content type as an empty string; text and metadata are each
     * preceded by a boolean telling whether they are present.
     *
     * @param out
     *            the output to write to
     * @throws IOException
     *             if the output cannot be written
     **/
    public void writeCommon(DataOutput out) throws IOException {
        out.writeByte(CUR_VERSION); // write version
        // NOTE(review): the URL is written unconditionally - presumably it
        // must have been set before serializing; confirm with callers
        Text.writeString(out, url); // write url
        if (content == null) {
            out.writeInt(0); // write content
        } else {
            out.writeInt(content.length); // write content
            out.write(content);
        }
        // write contentType
        Text.writeString(out, contentType != null ? contentType : "");
        out.writeBoolean(text != null);
        if (text != null) {
            Text.writeString(out, text); // write text
        }
        out.writeBoolean(metadata != null);
        if (metadata != null) {
            metadata.write(out); // write metadata
        }
    }

    /**
     * Writes the pool of distinct annotation type and feature names, in order
     * of first appearance, followed by the number of annotations and the
     * annotations themselves.
     **/
    private void writeAnnotations(DataOutput out) throws IOException {
        List<String> atypes = new ArrayList<String>();
        if (annotations != null) {
            // go through the annotations and check the annotation types that
            // are present
            for (Annotation annot : annotations) {
                if (!atypes.contains(annot.getType())) {
                    atypes.add(annot.getType());
                }
                // an annotation may have no feature map at all, as tolerated
                // by writeAnnotation
                if (annot.getFeatures() == null) {
                    continue;
                }
                for (String fn : annot.getFeatures().keySet()) {
                    if (!atypes.contains(fn)) {
                        atypes.add(fn);
                    }
                }
            }
        }
        out.writeInt(atypes.size());
        // write the annotation type and feature names
        // to the output
        for (String type : atypes) {
            Text.writeString(out, type);
        }
        // write annotations
        if (annotations == null) {
            out.writeInt(0);
            return;
        }
        out.writeInt(annotations.size());
        for (Annotation annot : annotations) {
            writeAnnotation(annot, out, atypes);
        }
    }

    /**
     * Writes a single annotation: the position of its type in {@code atypes},
     * its start and end offsets, its number of features and then, for each
     * feature, the position of the feature name in {@code atypes} followed by
     * the feature value.
     *
     * @param annot
     *            the annotation to write
     * @param out
     *            the output to write to
     * @param atypes
     *            pool of type and feature names; expected to contain the type
     *            and every feature name of {@code annot}
     * @throws IOException
     *             if the output cannot be written
     **/
    protected void writeAnnotation(Annotation annot, DataOutput out,
            List<String> atypes) throws IOException {
        int typePos = atypes.indexOf(annot.getType());
        IntWritable intStringPool = new IntWritable(typePos);
        intStringPool.write(out);
        WritableUtils.writeVLong(out, annot.getStart());
        WritableUtils.writeVLong(out, annot.getEnd());
        out.writeInt(annot.getFeatureNum());
        if (annot.getFeatures() != null) {
            for (String fname : annot.getFeatures().keySet()) {
                int fnamePos = atypes.indexOf(fname);
                intStringPool.set(fnamePos);
                intStringPool.write(out);
                WritableUtils.writeString(out, annot.getFeatures().get(fname));
            }
        }
    }

    /**
     * Reads a single annotation in the format produced by
     * {@link #writeAnnotation(Annotation, DataOutput, List)} and stores the
     * result in {@code annot}. The features passed to the annotation are null
     * when the stored number of features is 0.
     *
     * @param annot
     *            the annotation to populate
     * @param in
     *            the input to read from
     * @param types
     *            pool of type and feature names the stored positions refer to
     * @throws IOException
     *             if the input cannot be read
     **/
    public void readAnnotationFields(Annotation annot, DataInput in,
            List<String> types) throws IOException {
        IntWritable posType = new IntWritable();
        posType.readFields(in);
        annot.setType(types.get(posType.get()));
        annot.setStart(WritableUtils.readVLong(in));
        annot.setEnd(WritableUtils.readVLong(in));
        HashMap<String, String> features = null;
        int numFeatures = in.readInt();
        if (numFeatures > 0) {
            features = new HashMap<String, String>(numFeatures);
        }
        for (int i = 0; i < numFeatures; i++) {
            posType.readFields(in);
            String fname = types.get(posType.get());
            String fvalue = WritableUtils.readString(in);
            features.put(fname, fvalue);
        }
        annot.setFeatures(features);
    }

    /** Deserialization of a BehemothDocument **/
    public static BehemothDocument read(DataInput in) throws IOException {
        BehemothDocument doc = new BehemothDocument();
        doc.readFields(in);
        return doc;
    }

    /**
     * Returns a complete string representation of the document
     **/
    @Override
    public String toString() {
        return toString(true, true, true, true);
    }

    /**
     * Returns a string representation of the document
     *
     * @param binaryContent
     *            whether to include the binary content
     **/
    public String toString(boolean binaryContent) {
        return toString(binaryContent, true, true, true);
    }

    /**
     * Returns a string representation of the document. The URL and content
     * type are always included; the other sections are included only when
     * requested and when the corresponding field is not null. At most the
     * first 200 bytes of the content and the first 200 characters of the text
     * are shown.
     *
     * @param showContent
     *            whether to include the binary content
     * @param showAnnotations
     *            whether to include the annotations content
     * @param showText
     *            whether to include the text
     * @param showMD
     *            whether to include the metadata
     **/
    public String toString(boolean showContent, boolean showAnnotations,
            boolean showText, boolean showMD) {
        StringBuilder buffer = new StringBuilder();
        buffer.append("\nurl: ").append(url);
        buffer.append("\ncontentType: ").append(contentType);
        if (metadata != null && showMD) {
            buffer.append("\nmetadata: ");
            for (Entry<Writable, Writable> e : metadata.entrySet()) {
                buffer.append("\n\t");
                buffer.append(e.getKey());
                buffer.append(": ");
                buffer.append(e.getValue());
            }
        }
        // the content is null until it has been set or read, so guard it the
        // same way as the text below
        if (content != null && showContent) {
            buffer.append("\nContent:\n");
            int maxLengthContent = Math.min(200, content.length);
            // try default encoding
            buffer.append(new String(Arrays.copyOfRange(content, 0,
                    maxLengthContent)));
        }
        if (this.text != null && showText) {
            buffer.append("\nText:\n");
            int maxLengthText = Math.min(200, text.length());
            buffer.append(text.substring(0, maxLengthText));
        }
        if (annotations == null || !showAnnotations) {
            return buffer.toString();
        }
        buffer.append("\nAnnotations:\n");
        for (Annotation ann : annotations) {
            buffer.append("\t").append(ann.toString()).append("\n");
        }
        return buffer.toString();
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy