
eu.interedition.text.xml.XMLTransformer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of text-core Show documentation
Show all versions of text-core Show documentation
Stand-off Markup/Annotation Text Model
The newest version!
/*
* #%L
* Text: A text model with range-based markup via standoff annotations.
* %%
* Copyright (C) 2010 - 2011 The Interedition Development Group
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
package eu.interedition.text.xml;
import com.google.common.base.Throwables;
import com.google.common.io.Closeables;
import com.google.common.io.FileBackedOutputStream;
import eu.interedition.text.Layer;
import eu.interedition.text.Text;
import eu.interedition.text.TextConstants;
import eu.interedition.text.TextRange;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.Collections;
import java.util.List;
import java.util.Stack;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
/**
* @author Gregor Middell
*/
public class XMLTransformer {
/** Logger named after the runtime class of this transformer. */
private final Logger LOG = Logger.getLogger(getClass().getName());
/** Factory used by transform() to open a StAX reader over the source content. */
private final XMLInputFactory xmlInputFactory = XML.createXMLInputFactory();
/** Configuration handed to the constructor; consulted throughout the transformation. */
private final XMLTransformerConfiguration configuration;
/**
 * Modules notified about every transformation event; taken from the configuration in the constructor.
 * NOTE(review): the declaration below reads "List>", which looks like generic type arguments were
 * lost when this file was extracted -- restore the full type from the upstream source.
 */
private final List> modules;
/** Entities currently open: pushed in start(XMLEntity), popped in end(XMLEntity). */
private final Stack elementContext = new Stack();
/** One inclusion flag per open entity; the top is what isIncluded() reports. */
private final Stack inclusionContext = new Stack();
/** One space-preservation flag per open entity; the top is what isSpacePreserved() reports. */
private final Stack spacePreservationContext = new Stack();
/** Node path maintained while walking the document; nextSibling() increments its last component. */
private final XMLNodePath nodePath = new XMLNodePath();
/** Layer passed to the most recent transform() call. */
private Layer source;
/** Layer returned by configuration.start(source) for the most recent transform() call. */
private Layer target;
/** Buffer receiving the generated text; created in start(), filled by write(), read back by read(). */
private FileBackedOutputStream textBuffer;
/** Text offset at which the currently open text node began, or -1 while no text node is open. */
private long textStartOffset;
/** Last character written from source text; compared against the next character when compressing whitespace. */
private char lastChar;
/** Running offset into the source; advanced only by mapOffsetDelta(). */
private long sourceOffset;
/** Running offset into the generated text; advanced only by mapOffsetDelta(). */
private long textOffset;
/** Source side of the offset mapping currently being accumulated by mapOffsetDelta(). */
private TextRange sourceOffsetRange;
/** Text side of the offset mapping currently being accumulated by mapOffsetDelta(). */
private TextRange textOffsetRange;
/**
 * Creates a transformer driven by the given configuration.
 * The module list is read from the configuration once, at construction time.
 *
 * @param configuration the configuration to use; must not be {@code null}
 */
public XMLTransformer(XMLTransformerConfiguration configuration) {
    this.modules = configuration.getModules();
    this.configuration = configuration;
}
/**
 * Transforms the XML content of {@code source} into the target layer obtained from
 * {@code configuration.start(source)}.
 *
 * <p>The source is streamed through a StAX reader. For every parser event the gap between the
 * parser's character offset and the tracked source offset is recorded as a source-only offset
 * delta. Element, comment and processing-instruction events close any open text node and are
 * dispatched to the modules; character, CDATA and entity-reference events are forwarded as text.
 * Once the stream is exhausted, the buffered text is handed to
 * {@code configuration.end(target, reader)}.</p>
 *
 * @param source the layer whose content is parsed as XML
 * @return the target layer created by the configuration
 * @throws IOException if an I/O failure occurs while streaming or reading back the text buffer
 * @throws XMLStreamException if the root cause of a failure is an XML parsing error
 */
public Layer transform(final Layer source) throws IOException, XMLStreamException {
    this.source = source;
    this.target = configuration.start(source);
    try {
        source.stream(new Text.Consumer() {
            @Override
            public void consume(Reader sourceReader) throws IOException {
                XMLStreamReader xmlStream = null;
                try {
                    xmlStream = xmlInputFactory.createXMLStreamReader(sourceReader);
                    // Typed stack: push()/pop() results are passed straight to start(XMLEntity)/end(XMLEntity),
                    // which a raw Stack (yielding Object) does not allow.
                    final Stack<XMLEntity> entities = new Stack<XMLEntity>();
                    start();
                    while (xmlStream.hasNext()) {
                        final int event = xmlStream.next();
                        // Source characters skipped by the parser since the last event have no text counterpart.
                        mapOffsetDelta(0, xmlStream.getLocation().getCharacterOffset() - sourceOffset);
                        switch (event) {
                            case XMLStreamConstants.START_ELEMENT:
                                endText();
                                nextSibling();
                                start(entities.push(XMLEntity.newElement(xmlStream)));
                                break;
                            case XMLStreamConstants.END_ELEMENT:
                                endText();
                                end(entities.pop());
                                break;
                            case XMLStreamConstants.COMMENT:
                                endText();
                                nextSibling();
                                emptyEntity(XMLEntity.newComment(xmlStream));
                                break;
                            case XMLStreamConstants.PROCESSING_INSTRUCTION:
                                endText();
                                nextSibling();
                                emptyEntity(XMLEntity.newPI(xmlStream));
                                break;
                            case XMLStreamConstants.CHARACTERS:
                            case XMLStreamConstants.ENTITY_REFERENCE:
                            case XMLStreamConstants.CDATA:
                                newText(xmlStream.getText());
                                break;
                        }
                    }
                    end();
                } catch (XMLStreamException e) {
                    // consume() may only throw IOException; the outer catch unwraps this again.
                    throw Throwables.propagate(e);
                } finally {
                    XML.closeQuietly(xmlStream);
                }
            }
        });
        Reader textReader = null;
        try {
            configuration.end(target, textReader = read());
            return target;
        } finally {
            Closeables.close(textReader, false);
        }
    } catch (Throwable t) {
        // Re-throw with the declared checked types where possible, otherwise unchecked.
        Throwables.propagateIfInstanceOf(t, IOException.class);
        Throwables.propagateIfInstanceOf(Throwables.getRootCause(t), XMLStreamException.class);
        throw Throwables.propagate(t);
    }
}
/**
 * @return the layer passed to the most recent {@code transform} call, or {@code null} before the first call
 */
public Layer getSource() {
    return this.source;
}
/**
 * @return the layer the configuration created for the most recent {@code transform} call,
 *         or {@code null} before the first call
 */
public Layer getTarget() {
    return this.target;
}
/**
 * @return the configuration this transformer was constructed with
 */
public XMLTransformerConfiguration getConfiguration() {
    return this.configuration;
}
/**
 * Returns the modules of this transformer wrapped in an unmodifiable list view.
 *
 * NOTE(review): the return type below reads "List>", which looks like generic type arguments
 * were lost when this file was extracted -- restore the full type from the upstream source.
 */
public List> getModules() {
return Collections.unmodifiableList(modules);
}
/**
 * @return the internal stack of inclusion flags (the live instance, not a copy)
 */
public Stack getInclusionContext() {
    return this.inclusionContext;
}
/**
 * Tells whether content at the current position is included.
 *
 * @return {@code true} if no entity is open, otherwise the inclusion flag of the innermost open entity
 */
public boolean isIncluded() {
    if (inclusionContext.isEmpty()) {
        return true;
    }
    return inclusionContext.peek();
}
/**
 * @return the internal stack of space-preservation flags (the live instance, not a copy)
 */
public Stack getSpacePreservationContext() {
    return this.spacePreservationContext;
}
/**
 * Tells whether whitespace is to be preserved at the current position.
 *
 * @return {@code false} if no entity is open, otherwise the space-preservation flag of the
 *         innermost open entity
 */
public boolean isSpacePreserved() {
    if (spacePreservationContext.isEmpty()) {
        return false;
    }
    return spacePreservationContext.peek();
}
/**
 * @return the internal stack of currently open entities (the live instance, not a copy)
 */
public Stack getElementContext() {
    return this.elementContext;
}
/**
 * Asks the configuration whether the innermost open entity is a container element.
 *
 * @return {@code false} if no entity is open, otherwise the configuration's verdict
 */
public boolean isContainerElement() {
    if (elementContext.isEmpty()) {
        return false;
    }
    return configuration.isContainerElement(elementContext.peek());
}
/**
 * Asks the configuration whether the innermost open entity is a line element.
 *
 * @return {@code false} if no entity is open, otherwise the configuration's verdict
 */
public boolean isLineElement() {
    if (elementContext.isEmpty()) {
        return false;
    }
    return configuration.isLineElement(elementContext.peek());
}
/**
 * Asks the configuration whether the innermost open entity is notable.
 *
 * @return {@code false} if no entity is open, otherwise the configuration's verdict
 */
public boolean isNotable() {
    if (elementContext.isEmpty()) {
        return false;
    }
    return configuration.isNotable(elementContext.peek());
}
/**
 * @return the internal node path tracker (the live instance, not a copy)
 */
public XMLNodePath getNodePath() {
    return this.nodePath;
}
/**
 * @return the current offset into the generated text
 */
public long getTextOffset() {
    return this.textOffset;
}
/**
 * @return the current offset into the source
 */
public long getSourceOffset() {
    return this.sourceOffset;
}
/**
 * @return the text offset at which the currently open text node began, or {@code -1} if no
 *         text node is open
 */
public long getTextStartOffset() {
    return this.textStartOffset;
}
/**
 * Charset used to encode text into the buffer in {@link #write(String, boolean)} and to decode it
 * again in {@link #read()}. Fixed to UTF-8 so that every character survives the round trip
 * regardless of the platform default charset.
 */
private static final Charset TEXT_BUFFER_CHARSET = Charset.forName("UTF-8");

/**
 * Appends text to the text buffer and records the resulting offset movement.
 *
 * <p>For text coming from the source, line breaks are replaced by a single space and, unless
 * whitespace is preserved or compression is disabled in the configuration, a whitespace character
 * following another whitespace character is dropped. Every kept character advances the text and
 * the source offset by one; every dropped character advances only the source offset. Generated
 * text is written unchanged and advances only the text offset.</p>
 *
 * <p>The kept characters are encoded as one string rather than one {@code char} at a time, so
 * that surrogate pairs are encoded as a unit instead of being turned into replacement bytes.</p>
 *
 * @param text the text to append; must not be {@code null}
 * @param fromSource {@code true} if the text stems from the source document, {@code false} if it
 *                   is generated
 * @throws RuntimeException wrapping an {@link IOException} raised by the text buffer
 */
public void write(String text, boolean fromSource) {
    if (LOG.isLoggable(Level.FINER)) {
        LOG.finer("Inserting Text: '" + text.replaceAll("[\r\n]+", "\\\\n") + "' (" + (fromSource ? "from source" : "generated") + ")");
    }
    try {
        final int textLength = text.length();
        final StringBuilder inserted = new StringBuilder(textLength);
        if (fromSource) {
            final boolean compressSpace = !isSpacePreserved() && configuration.isCompressingWhitespace();
            for (int cc = 0; cc < textLength; cc++) {
                char currentChar = text.charAt(cc);
                if (compressSpace && Character.isWhitespace(lastChar) && Character.isWhitespace(currentChar)) {
                    // Dropped from the text, but still consumed from the source.
                    mapOffsetDelta(0, 1);
                    continue;
                }
                if (currentChar == '\n' || currentChar == '\r') {
                    currentChar = ' ';
                }
                lastChar = currentChar;
                inserted.append(currentChar);
                mapOffsetDelta(1, 1);
            }
            textBuffer.write(inserted.toString().getBytes(TEXT_BUFFER_CHARSET));
        } else {
            textBuffer.write(text.getBytes(TEXT_BUFFER_CHARSET));
            inserted.append(text);
            mapOffsetDelta(inserted.length(), 0);
        }
        final String insertedStr = inserted.toString();
        for (XMLTransformerModule m : configuration.getModules()) {
            m.textWritten(this, text, insertedStr);
        }
    } catch (IOException e) {
        throw Throwables.propagate(e);
    }
}

/**
 * Opens a reader over everything written to the text buffer so far.
 *
 * @return a reader decoding the buffer with the same charset {@link #write(String, boolean)} encodes with
 * @throws IOException if flushing or opening the buffer fails
 */
Reader read() throws IOException {
    textBuffer.flush();
    return new InputStreamReader(textBuffer.getSupplier().getInput(), TEXT_BUFFER_CHARSET);
}
/**
 * Resets all per-document state and announces the start of a document to every module.
 */
void start() {
    if (LOG.isLoggable(Level.FINER)) {
        LOG.finer("Start of document");
    }

    // Forget whatever an earlier run left behind and start the node path at its first component.
    elementContext.clear();
    inclusionContext.clear();
    spacePreservationContext.clear();
    nodePath.clear();
    nodePath.push(0);

    textBuffer = new FileBackedOutputStream(configuration.getTextBufferSize(), true);

    // Pretending the previous character was a space makes leading whitespace subject to compression.
    if (configuration.isRemoveLeadingWhitespace()) {
        lastChar = ' ';
    } else {
        lastChar = 0;
    }

    textStartOffset = -1;
    sourceOffset = 0;
    textOffset = 0;
    sourceOffsetRange = TextRange.NULL;
    textOffsetRange = TextRange.NULL;

    for (XMLTransformerModule module : modules) {
        module.start(this);
    }
}
/**
 * Finishes a document: reports the offset mapping still pending, announces the end of the
 * document to every module and removes the node path component pushed at document start.
 */
void end() {
    emitOffsetMapping();

    final boolean traceEnabled = LOG.isLoggable(Level.FINER);
    if (traceEnabled) {
        LOG.finer("End of document");
    }

    for (XMLTransformerModule module : modules) {
        module.end(this);
    }

    nodePath.pop();
}
/**
 * Opens an entity: derives its inclusion and space-preservation flags, extends the node path,
 * registers the entity as the innermost open one and notifies every module.
 *
 * @param entity the entity being opened
 */
void start(XMLEntity entity) {
    if (LOG.isLoggable(Level.FINER)) {
        LOG.finer("Start of " + entity);
    }

    // Inside included content an entity stays included unless excluded;
    // inside excluded content it must be included explicitly.
    final boolean parentIncluded = inclusionContext.isEmpty() || inclusionContext.peek();
    final boolean included;
    if (parentIncluded) {
        included = !configuration.excluded(entity);
    } else {
        included = configuration.included(entity);
    }
    inclusionContext.push(included);

    // Space preservation is inherited unless the entity carries its own xml:space attribute.
    final boolean inheritedPreservation = !spacePreservationContext.isEmpty() && spacePreservationContext.peek();
    final Object xmlSpace = entity.getAttributes().get(TextConstants.XML_SPACE_ATTR_NAME);
    if (xmlSpace == null) {
        spacePreservationContext.push(inheritedPreservation);
    } else {
        spacePreservationContext.push("preserve".equalsIgnoreCase(xmlSpace.toString()));
    }

    nodePath.set(entity.getAttributes());
    nodePath.push(0);
    elementContext.push(entity);

    for (XMLTransformerModule module : modules) {
        module.start(this, entity);
    }
}
/**
 * Closes an entity: notifies every module first, then unwinds the per-entity state that
 * {@code start(XMLEntity)} pushed.
 *
 * @param entity the entity being closed
 */
void end(XMLEntity entity) {
    final boolean traceEnabled = LOG.isLoggable(Level.FINER);
    if (traceEnabled) {
        LOG.finer("End of " + entity);
    }

    for (XMLTransformerModule module : modules) {
        module.end(this, entity);
    }

    // Unwind in the reverse order of the pushes done when the entity was opened.
    elementContext.pop();
    nodePath.pop();
    spacePreservationContext.pop();
    inclusionContext.pop();
}
/**
 * Processes an entity without content by opening and immediately closing it.
 *
 * @param entity the entity to report
 */
void emptyEntity(XMLEntity entity) {
    this.start(entity);
    this.end(entity);
}
/**
 * Advances the node path to the next sibling by incrementing its last component.
 */
void nextSibling() {
    if (LOG.isLoggable(Level.FINER)) {
        LOG.finer("Next sibling");
    }
    final int previousPosition = nodePath.pop();
    nodePath.push(previousPosition + 1);
}
/**
 * Closes the current text node, if any. Modules are only notified when a text node is open and
 * the text offset has moved past the offset at which that node began.
 */
void endText() {
    final boolean textNodeOpen = textStartOffset >= 0;
    final boolean textProduced = textOffset > textStartOffset;
    if (textNodeOpen && textProduced) {
        if (LOG.isLoggable(Level.FINER)) {
            LOG.finer("End of text node");
        }
        for (XMLTransformerModule module : modules) {
            module.endText(this);
        }
    }
    // Mark "no text node open" in any case.
    textStartOffset = -1;
}
/**
 * Reports a chunk of text read from the source to every module, opening a new text node first
 * if none is open.
 *
 * <p>A {@code null} chunk is ignored. {@code XMLStreamReader.getText()} may return {@code null}
 * for an entity reference without replacement text, and passing that on would fail with a
 * {@link NullPointerException} as soon as the text is inspected.</p>
 *
 * @param text the text chunk reported by the parser; may be {@code null}
 * @throws IOException declared for module processing of the text
 */
void newText(String text) throws IOException {
    if (text == null) {
        if (LOG.isLoggable(Level.FINER)) {
            LOG.finer("Ignoring text event without content");
        }
        return;
    }
    if (textStartOffset < 0) {
        // First chunk of a new text node: it counts as a sibling of its own.
        nextSibling();
        textStartOffset = textOffset;
        if (LOG.isLoggable(Level.FINER)) {
            LOG.finer("Start of text node");
        }
        for (XMLTransformerModule m : modules) {
            m.startText(this);
        }
    }
    if (LOG.isLoggable(Level.FINER)) {
        LOG.finer("Text: '" + text.replaceAll("[\r\n]+", "\\\\n") + "'");
    }
    for (XMLTransformerModule m : modules) {
        m.text(this, text);
    }
}
/**
 * Moves the text and source offsets and folds the movement into the pending offset mapping.
 *
 * <p>The pending mapping is extended in place in three cases: only the source moves while the
 * pending text range is empty; only the text moves while the pending source range is empty; or
 * both move by the same amount while both pending ranges have the same length. In every other
 * case the pending mapping is emitted and a new one is started at the ends of the old ranges.</p>
 *
 * @param addToText amount by which the text offset moves
 * @param addToSource amount by which the source offset moves
 */
void mapOffsetDelta(long addToText, long addToSource) {
    final boolean textUnmoved = (addToText == 0);
    final boolean sourceUnmoved = (addToSource == 0);
    if (textUnmoved && sourceUnmoved) {
        return;
    }
    if (LOG.isLoggable(Level.FINER)) {
        LOG.finer("Moving offsets: text += " + addToText + "; source += " + addToSource);
    }

    final long pendingTextLength = textOffsetRange.length();
    final long pendingSourceLength = sourceOffsetRange.length();

    // The three conditions are mutually exclusive because at least one delta is non-zero here.
    final boolean growSourceOnly = textUnmoved && pendingTextLength == 0;
    final boolean growTextOnly = sourceUnmoved && pendingSourceLength == 0;
    final boolean growInParallel = pendingTextLength == pendingSourceLength && addToText == addToSource;

    if (growSourceOnly || growTextOnly || growInParallel) {
        if (growSourceOnly || growInParallel) {
            sourceOffsetRange = new TextRange(sourceOffsetRange.getStart(), sourceOffsetRange.getEnd() + addToSource);
        }
        if (growTextOnly || growInParallel) {
            textOffsetRange = new TextRange(textOffsetRange.getStart(), textOffsetRange.getEnd() + addToText);
        }
    } else {
        emitOffsetMapping();
        sourceOffsetRange = new TextRange(sourceOffsetRange.getEnd(), sourceOffsetRange.getEnd() + addToSource);
        textOffsetRange = new TextRange(textOffsetRange.getEnd(), textOffsetRange.getEnd() + addToText);
    }

    textOffset += addToText;
    sourceOffset += addToSource;
}
/**
 * Reports the pending offset mapping to every module, unless both pending ranges are empty.
 */
void emitOffsetMapping() {
    if (textOffsetRange.length() != 0 || sourceOffsetRange.length() != 0) {
        if (LOG.isLoggable(Level.FINER)) {
            LOG.finer("New offset mapping: text = " + textOffsetRange + "==> source += " + sourceOffsetRange);
        }
        for (XMLTransformerModule module : modules) {
            module.offsetMapping(this, textOffsetRange, sourceOffsetRange);
        }
    }
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy