/**
* Copyright 2017 TerraMeta Software, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.cloudgraph.mapreduce;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.lang.reflect.Method;
import javax.xml.stream.XMLStreamException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.StringUtils;
import org.cloudgraph.common.CloudGraphConstants;
import org.cloudgraph.store.service.MetricCollector;
import org.plasma.sdo.core.CoreDataObject;
import org.plasma.sdo.xml.DefaultOptions;
import org.plasma.sdo.xml.StreamUnmarshaller;
import org.plasma.sdo.xml.UnmarshallerException;
import commonj.sdo.DataGraph;
import commonj.sdo.helper.XMLDocument;
/**
 * An HDFS XML text file record reader that iterates over HDFS data for the
 * current {@link FileSplit}, unmarshalling the XML into structured data
 * graphs based on structural and XML-specific metadata from the underlying
 * domain model. Data graphs may be heterogeneous and of any size or
 * complexity, and are supplied through {@link GraphXmlInputFormat}, including
 * graphs where the underlying domain model contains instances of multiple
 * inheritance. The unmarshalling is stream oriented and leverages the StAX
 * (streaming XML) parser based Plasma {@link StreamUnmarshaller}.
 *
 * Several job {@link Counters} are set up which accumulate various metrics
 * related to the resulting graph, including the time taken to unmarshal the
 * XML, among other metrics.
 *
 * This XML text file record reader is "line oriented"; every HDFS line is
 * assumed to be a single data graph marshalled as XML. Below is an example
 * where a given data graph is serialized as a single line.
*
*
 * <pre>
 * protected byte[] marshal(DataGraph graph) throws IOException {
 *   DefaultOptions options = new DefaultOptions(graph.getRootObject().getType().getURI());
 *   options.setRootNamespacePrefix("c");
 *   options.setPrettyPrint(false);
 *   XMLDocument doc = PlasmaXMLHelper.INSTANCE.createDocument(graph.getRootObject(), graph
 *       .getRootObject().getType().getURI(), null);
 *   doc.setXMLDeclaration(false);
 *   ByteArrayOutputStream os = new ByteArrayOutputStream();
 *   PlasmaXMLHelper.INSTANCE.save(doc, os, options);
 *   os.close();
 *   return os.toByteArray();
 * }
 * </pre>
*
*
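 * For context, this reader is typically wired into a job through
 * {@link GraphXmlInputFormat}. The sketch below is illustrative only: the
 * mapper class, input path and namespace URI are hypothetical placeholders;
 * only the configuration keys ({@link GraphXmlInputFormat#ROOT_ELEM_NAMESPACE_URI}
 * and {@link GraphXmlInputFormat#ROOT_ELEM_NAMESPACE_PREFIX}) are taken from
 * this class.
 *
 * <pre>
 * Configuration conf = new Configuration();
 * // root element namespace of the XML serialized graphs (illustrative URI)
 * conf.set(GraphXmlInputFormat.ROOT_ELEM_NAMESPACE_URI, "http://example.org/my/model");
 * conf.set(GraphXmlInputFormat.ROOT_ELEM_NAMESPACE_PREFIX, "c");
 * Job job = Job.getInstance(conf, "graph-xml-import");
 * job.setInputFormatClass(GraphXmlInputFormat.class);
 * // MyGraphMapper is a hypothetical mapper consuming LongWritable / GraphWritable pairs
 * job.setMapperClass(MyGraphMapper.class);
 * FileInputFormat.addInputPath(job, new Path("/data/graphs.xml"));
 * </pre>
 *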
* @see org.cloudgraph.mapreduce.GraphWritable
* @see org.cloudgraph.mapreduce.GraphXmlInputFormat
*
* @author Scott Cinnamond
* @since 0.5.8
*/
public class GraphXmlRecordReader extends RecordReader<LongWritable, GraphWritable> {
static final Log log = LogFactory.getLog(GraphXmlRecordReader.class);
private long start;
private long pos;
private long end;
private LineReader in;
private int maxLineLength;
private LongWritable key = new LongWritable();
private GraphWritable value = null;
private Configuration configuration;
private String rootNamespaceUri;
private String rootNamespacePrefix;
private DefaultOptions unmarshalOptions;
private StreamUnmarshaller unmarshaler;
private TaskAttemptContext context;
private Method getCounter = null;
private long totalGraphNodesAssembled = 0;
private long totalGraphUnmarshalTime = 0;
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context) throws IOException,
InterruptedException {
    // This InputSplit is a FileSplit
FileSplit split = (FileSplit) inputSplit;
this.context = context;
this.configuration = context.getConfiguration();
this.getCounter = Counters.retrieveGetCounterWithStringsParams(context);
this.rootNamespaceUri = configuration.get(GraphXmlInputFormat.ROOT_ELEM_NAMESPACE_URI);
this.rootNamespacePrefix = configuration.get(GraphXmlInputFormat.ROOT_ELEM_NAMESPACE_PREFIX,
"ns1");
this.unmarshalOptions = new DefaultOptions(this.rootNamespaceUri);
this.unmarshalOptions.setRootNamespacePrefix(this.rootNamespacePrefix);
this.unmarshalOptions.setValidate(false);
this.unmarshalOptions.setFailOnValidationError(false);
this.unmarshaler = new StreamUnmarshaller(this.unmarshalOptions, null);
// Retrieve configuration, and Max allowed
// bytes for a single record
this.maxLineLength = configuration.getInt("mapred.linerecordreader.maxlength",
Integer.MAX_VALUE);
// Split "S" is responsible for all records
// starting from "start" and "end" positions
start = split.getStart();
end = start + split.getLength();
// Retrieve file containing Split "S"
final Path file = split.getPath();
FileSystem fs = file.getFileSystem(this.configuration);
FSDataInputStream fileIn = fs.open(split.getPath());
// If Split "S" starts at byte 0, first line will be processed
// If Split "S" does not start at byte 0, first line has been already
// processed by "S-1" and therefore needs to be silently ignored
boolean skipFirstLine = false;
if (start != 0) {
skipFirstLine = true;
      // Set the file pointer at the "start - 1" position.
      // This is to make sure we won't miss any line:
      // it could happen if "start" is located on an EOL
--start;
fileIn.seek(start);
}
in = new LineReader(fileIn, this.configuration);
    // If the first line needs to be skipped, read it
    // and store its content in a dummy Text
if (skipFirstLine) {
Text dummy = new Text();
// Reset "start" to "start + line offset"
start += in.readLine(dummy, 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
}
// Position is the actual start
this.pos = start;
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
// Current offset is the key
key.set(pos);
int newSize = 0;
// Make sure we get at least one record that starts in this Split
    while (pos < end) {
      Text text = new Text();
      // Read the next line and store its content in "text"
      newSize = in.readLine(text, maxLineLength,
          Math.max((int) Math.min(Integer.MAX_VALUE, end - pos), maxLineLength));
      // No byte read, seems that we reached the end of the Split
      // Break and return false (no key / value)
      if (newSize == 0) {
        break;
      }
      // Line is read, new position is set
      pos += newSize;
      // Line is shorter than the maximum record line size;
      // unmarshal it as a data graph, break and return true (found key / value)
      if (newSize < maxLineLength) {
        DataGraph graph = this.unmarshal(text);
        this.value = new GraphWritable(graph);
        updateCounters();
        break;
      }
      // Line is too long
      // Try again with position = position + line offset,
      // i.e. ignore the line and go to the next one
      log.error("Skipped line of size " + newSize + " at pos " + (pos - newSize));
    }
if (newSize == 0) {
// We've reached end of Split
key = null;
value = null;
return false;
} else {
// Tell Hadoop a new line has been found
// key / value will be retrieved by
// getCurrentKey getCurrentValue methods
return true;
}
}
@Override
public LongWritable getCurrentKey() throws IOException, InterruptedException {
return key;
}
@Override
public GraphWritable getCurrentValue() throws IOException, InterruptedException {
return value;
}
@Override
public float getProgress() throws IOException, InterruptedException {
if (start == end) {
return 0.0f;
} else {
return Math.min(1.0f, (pos - start) / (float) (end - start));
}
}
@Override
public void close() throws IOException {
if (in != null) {
in.close();
}
}
/**
* Updates various job counters.
*
* @throws IOException
*/
private void updateCounters() throws IOException {
    // we can get access to counters only if the job uses the new mapreduce APIs
if (this.getCounter == null) {
return;
}
try {
((Counter) this.getCounter.invoke(context, Counters.CLOUDGRAPH_COUNTER_GROUP_NAME,
Counters.CLOUDGRAPH_COUNTER_NAME_NUM_GRAPH_NODES_ASSEMBLED))
.increment(this.totalGraphNodesAssembled);
((Counter) this.getCounter.invoke(context, Counters.CLOUDGRAPH_COUNTER_GROUP_NAME,
Counters.CLOUDGRAPH_COUNTER_NAME_TOT_GRAPH_XML_UNMARSHAL_TIME))
.increment(this.totalGraphUnmarshalTime);
} catch (Exception e) {
log.debug("can't update counter." + StringUtils.stringifyException(e));
}
}
  /**
   * Unmarshals the given text/xml as a data graph, capturing various metrics
   * and returning the new graph. The given text/xml represents a single line
   * in the underlying HDFS file and is assumed to be an XML serialized data
   * graph.
*
* @param text
* the input text
* @return the new graph
* @throws IOException
*/
private DataGraph unmarshal(Text text) throws IOException {
long before = System.currentTimeMillis();
String textString = text.toString();
ByteArrayInputStream xmlloadis = new ByteArrayInputStream(textString.getBytes("UTF-8"));
try {
this.unmarshaler.unmarshal(xmlloadis);
} catch (XMLStreamException e) {
throw new IOException(e);
} catch (UnmarshallerException e) {
throw new IOException(e);
}
XMLDocument doc = this.unmarshaler.getResult();
doc.setNoNamespaceSchemaLocation(null);
long after = System.currentTimeMillis();
CoreDataObject root = (CoreDataObject) doc.getRootObject();
MetricCollector visitor = new MetricCollector();
root.accept(visitor);
root.setValue(CloudGraphConstants.GRAPH_ASSEMBLY_TIME, Long.valueOf(after - before));
root.setValue(CloudGraphConstants.GRAPH_NODE_COUNT, Long.valueOf(visitor.getCount()));
root.setValue(CloudGraphConstants.GRAPH_DEPTH, Long.valueOf(visitor.getDepth()));
Long time = (Long) root.getValue(CloudGraphConstants.GRAPH_ASSEMBLY_TIME);
this.totalGraphUnmarshalTime = time.longValue();
Long nodeCount = (Long) root.getValue(CloudGraphConstants.GRAPH_NODE_COUNT);
this.totalGraphNodesAssembled = nodeCount.longValue();
return doc.getRootObject().getDataGraph();
}
}