All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.eigenbase.xom.wrappers.Annotator Maven / Gradle / Ivy

The newest version!
/*
// Licensed to Julian Hyde under one or more contributor license
// agreements. See the NOTICE file distributed with this work for
// additional information regarding copyright ownership.
//
// Julian Hyde licenses this file to you under the Apache License,
// Version 2.0 (the "License"); you may not use this file except in
// compliance with the License. You may obtain a copy of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/
package org.eigenbase.xom.wrappers;

import org.eigenbase.xom.*;
import org.w3c.dom.Node;

import java.util.*;
import java.io.PrintWriter;

/**
 * Quick and dirty XML parser that finds the precise start and end
 * position of all nodes in a document. Also finds all line endings, so
 * that character offsets can be converted to line/column positions.
 *
 * @author jhyde
 */
public class Annotator {
    private final List/**/ locInfoList = new ArrayList();
    private int[] lineStartPositions;
    private final String xml;
    private final Map/**/ wrapperLocMap =
        new HashMap();
    private final Map/**/ nodeLocMap = new HashMap();
    private int seq; // workspace for populateMap

    /**
     * Creates an Annotator.
     *
     * 

For testing purposes, wrapper may be null. Parses the XML * but does not build the mapping from location information to DOM nodes. * * @param xml XML source string * @param def Wrapper around root DOM node */ Annotator(String xml, DOMWrapper def) { this.xml = xml; parse(xml); if (def != null) { seq = 0; populateMap(def); assert this.nodeLocMap.size() == this.wrapperLocMap.size(); } } public Location getLocation(DOMWrapper wrapper) { LocInfo location0 = (LocInfo) wrapperLocMap.get(wrapper); if (location0 == null) { location0 = (Annotator.LocInfo) nodeLocMap.get(((W3CDOMWrapper) wrapper).node); if (location0 == null) { return null; } } final LocInfo location = location0; return new Location() { public int getStartLine() { return getLine(getStartPos()) + 1; } public int getStartColumn() { return getCol(getStartPos()) + 1; } public int getStartPos() { return location.startTagStartPos; } public int getEndLine() { return getLine(getEndPos()) + 1; } public int getEndColumn() { return getCol(getEndPos()) + 1; } public int getEndPos() { return location.endTagEndPos >= 0 ? location.endTagEndPos : location.startTagEndPos; } public String getText(boolean headOnly) { return location.getText(headOnly); } public String toString() { return location.toString(Annotator.this); } }; } /** * Returns the list of LocInfo. For testing. * * @return list of LocInfo. */ List getLocInfoList() { return locInfoList; } // enum State private static final int STATE_NORMAL = 0, STATE_TAG = 1, STATE_ENDTAG = 2, STATE_QUOT = 3, STATE_APOS = 4, STATE_COMMENT = 5, STATE_CDATA = 6; void parse(String s) { final ArrayStack/**/ lockInfoStack = new ArrayStack(); final List lineStartPositions = new ArrayList(); int state = STATE_NORMAL; final int count = s.length(); int i = 0; int last = 0; lineStartPositions.add(new Integer(i)); lockInfoStack.push(null); LocInfo location = null; loop: while (i < count) { final char c = s.charAt(i); switch (c) { case '<': stateSwitch: switch (state) { case STATE_NORMAL: if (i > last) { // Unlike other node types, we create the LocInfo // at the end of the element. No need to add the node // to the stack, because we'd just remove it again. LocInfo loc2 = new LocInfo(locInfoList.size(), TYPE_TEXT, last); loc2.endTagEndPos = i; locInfoList.add(loc2); } if (i + 1 < count) { final char c1 = s.charAt(i + 1); switch (c1) { case '/': // ^ state = STATE_ENDTAG; assert location != null; break stateSwitch; case '?': // ^ location = new LocInfo( locInfoList.size(), TYPE_PROCESSING_INSTRUCTION, i); locInfoList.add(location); state = STATE_TAG; i += " // Don't push until we see end of the head tag state = STATE_TAG; location = new LocInfo(locInfoList.size(), TYPE_ELEMENT, i); locInfoList.add(location); ++i; continue loop; } break; case '>': switch (state) { case STATE_TAG: ++i; assert location != null; switch (location.type) { case TYPE_PROCESSING_INSTRUCTION: // case TYPE_CDATA_SECTION: // case TYPE_COMMENT: // i += "-->".length(); location.endTagEndPos = i; last = i; location = (LocInfo) lockInfoStack.peek(); state = STATE_NORMAL; continue loop; } } break; case '\r': ++i; if (i < count && s.charAt(i) == '\n') { // only count windows line ending CR LF as one line ++i; } lineStartPositions.add(new Integer(i)); continue loop; case '\n': ++i; lineStartPositions.add(new Integer(i)); continue loop; case '\'': switch (state) { case STATE_APOS: // a='xxx^' state = STATE_TAG; break; case STATE_TAG: // a=^'xxx' state = STATE_APOS; break; case STATE_QUOT: // a="doesn^'t matter" default: break; } break; case '"': switch (state) { case STATE_QUOT: // a="xxx^" state = STATE_TAG; break; case STATE_TAG: // a=^"xxx" state = STATE_QUOT; break; case STATE_APOS: // a='doesn^"t matter' default: break; } break; } ++i; } this.lineStartPositions = new int[lineStartPositions.size()]; for (int j = 0; j < lineStartPositions.size(); j++) { this.lineStartPositions[j] = ((Integer) lineStartPositions.get(j)).intValue(); } } private void populateMap(DOMWrapper def) { final int defType = def.getType(); LocInfo location; while (true) { location = (LocInfo) locInfoList.get(seq++); if (defType == DOMWrapper.ELEMENT && location.type == TYPE_ELEMENT) { break; } if (defType == DOMWrapper.CDATA && location.type == TYPE_TEXT) { break; } if (seq >= locInfoList.size()) { return; } } wrapperLocMap.put(def, location); nodeLocMap.put(((W3CDOMWrapper) def).node, location); final DOMWrapper[] elementChildren = def.getElementChildren(); for (int i = 0; i < elementChildren.length; i++) { DOMWrapper domWrapper = elementChildren[i]; populateMap(domWrapper); } } /** * Returns the line that a character position falls on. The first line in a * document is numbered 0. * * @param pos Character position * @return Line (starting from 0) */ int getLine(int pos) { int index = Arrays.binarySearch(lineStartPositions, pos); if (index >= 0) { return index; } else { return -2 - index; } } /** * Returns the column that a character position falls on. The first column * in a line is numbered 0. * * @param pos Character position * @return column (starting from 0) */ int getCol(int pos) { int index = Arrays.binarySearch(lineStartPositions, pos); if (index >= 0) { return 0; } else { index = -2 - index; return pos - lineStartPositions[index]; } } void list(PrintWriter pw) { for (int i = 0; i < locInfoList.size(); i++) { LocInfo location = (LocInfo) locInfoList.get(i); pw.println( location.seq + ": " + location.toString(this) + " [" + location.getText(xml) + "]"); } pw.flush(); } // enum Type private static final int TYPE_ELEMENT = Node.ELEMENT_NODE, TYPE_PROCESSING_INSTRUCTION = Node.PROCESSING_INSTRUCTION_NODE, TYPE_COMMENT = Node.COMMENT_NODE, TYPE_CDATA_SECTION = Node.CDATA_SECTION_NODE, TYPE_TEXT = Node.TEXT_NODE; class LocInfo { /** Sequence in document, ordered by start position (prefix order) */ final int seq; /** Node type, typically {@link Node#ELEMENT_NODE}. */ final int startTagStartPos; final int type; int startTagEndPos = -1; // -1 if entity is a single tag int endTagEndPos = -1; /** * Creates a LocInfo. * * @param seq Sequence number in document * @param nodeType Node type, typically {@link Node#ELEMENT_NODE}. * @param startTagStartPos Position of start of element */ LocInfo(int seq, int nodeType, int startTagStartPos) { this.seq = seq; this.type = nodeType; this.startTagStartPos = startTagStartPos; } public String toString(Annotator annotator) { return "line " + annotator.getLine(startTagStartPos) + ", column " + annotator.getCol(startTagStartPos); } /** * Returns the fragment of source XML that this node encompasses. * * @param xml Whole source XML * @return fragment of source XML */ public String getText(String xml) { return xml.substring( startTagStartPos, endTagEndPos >= 0 ? endTagEndPos : xml.length()); } /** * Returns the fragment of source XML corresponding to the head tag * of this element, if this is an element, otherwise the whole node. * * @param xml Whole source XML * @return fragment of source XML */ public String getHeadText(String xml) { return xml.substring( startTagStartPos, startTagEndPos >= 0 ? startTagEndPos : endTagEndPos >= 0 ? endTagEndPos : xml.length()); } public String toString() { return getHeadText(xml); } /** * Returns the text of this location. Specification as for * {@link org.eigenbase.xom.Location#getText(boolean)}. * * @param headOnly Whether to return only the head of elements * @return Source text underlying a location */ public String getText(boolean headOnly) { return xml.substring( startTagStartPos, headOnly && startTagEndPos >= 0 ? startTagEndPos : endTagEndPos >= 0 ? endTagEndPos : xml.length()); } } /** * Similar to {@link Stack} but based on {@link ArrayList} instead of * {@link Vector}, and therefore more efficient. */ private static class ArrayStack extends ArrayList { public final void push(Object t) { if (false) System.out.println(size() + " push [" + t + "]"); add(t); } public final Object peek() { return get(size() - 1); } public final Object pop() { final int index = size() - 1; Object t = remove(index); if (false) System.out.println(size() + " pop [" + t + "]"); return get(index - 1); } } } // End Annotator.java





© 2015 - 2024 Weber Informatics LLC | Privacy Policy