// org.apache.solr.handler.tagger.XmlOffsetCorrector (Maven / Gradle / Ivy)

/*
 * This software was produced for the U. S. Government
 * under Contract No. W15P7T-11-C-F600, and is
 * subject to the Rights in Noncommercial Computer Software
 * and Noncommercial Computer Software Documentation
 * Clause 252.227-7014 (JUN 1995)
 *
 * Copyright 2013 The MITRE Corporation. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.handler.tagger;

import javax.xml.stream.XMLResolver;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.XMLEvent;
import java.io.InputStream;
import java.io.StringReader;

import com.ctc.wstx.stax.WstxInputFactory;
import org.apache.commons.io.input.ClosedInputStream;
import org.codehaus.stax2.LocationInfo;
import org.codehaus.stax2.XMLInputFactory2;
import org.codehaus.stax2.XMLStreamReader2;

/**
 * Corrects offsets to adjust for XML formatted data. The goal is such that the caller should be
 * able to insert a start XML tag at the start offset and a corresponding end XML tag at the end
 * offset of the tagger, and have it be valid XML.  See {@link #correctPair(int, int)}.
 *
 * <p>The constructor walks the document once with a StAX reader and records, for every element,
 * five ints in {@code tagInfo}: the parent tag id, the start and end char offsets of the open
 * tag, and the start and end char offsets of the close tag. It also appends to
 * {@code parentChangeOffsets} / {@code parentChangeIds} each offset at which the enclosing tag
 * changes, together with the id of the tag that becomes current there.
 *
 * <p>This will not work on invalid XML.
 *
 * <p>Not thread-safe.
 */
public class XmlOffsetCorrector extends OffsetCorrector {

  //TODO use StAX without hard requirement on woodstox.   xmlStreamReader.getLocation().getCharacterOffset()

  /** Shared reader factory; external entity lookups are answered with an already-closed stream. */
  private static final XMLInputFactory2 XML_INPUT_FACTORY;
  static {
    // note: similar code in Solr's EmptyEntityResolver
    XML_INPUT_FACTORY = new WstxInputFactory();
    XML_INPUT_FACTORY.setXMLResolver(new XMLResolver() {
      @Override
      public InputStream resolveEntity(String publicId, String systemId, String baseURI, String namespace) {
        // Never fetch anything external: hand back a stream that yields no content.
        return ClosedInputStream.CLOSED_INPUT_STREAM;
      }
    });
    // TODO disable DTD?
    // XML_INPUT_FACTORY.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.FALSE)
    XML_INPUT_FACTORY.configureForSpeed();
  }

  /**
   * Initialize based on the document text.
   *
   * <p>The XML reader created here is always closed before the constructor returns or throws.
   *
   * @param docText non-null XML content.
   * @throws XMLStreamException If there's a problem parsing the XML.
   */
  public XmlOffsetCorrector(String docText) throws XMLStreamException {
    super(docText, false);

    int tagCounter = 0; // id that will be given to the next element encountered
    int thisTag = -1;   // id of the element currently open; -1 while outside the root element

    //note: we *could* add a virtual outer tag to guarantee all text is in the context of a tag,
    // but we shouldn't need to because there is no findable text outside the top element.

    final XMLStreamReader2 xmlStreamReader =
            (XMLStreamReader2) XML_INPUT_FACTORY.createXMLStreamReader(new StringReader(docText));

    try {
      while (xmlStreamReader.hasNext()) {
        int eventType = xmlStreamReader.next();
        switch (eventType) {
          case XMLEvent.START_ELEMENT: {
            tagInfo.ensureCapacity(tagInfo.size() + 5);
            final int parentTag = thisTag;
            final LocationInfo info = xmlStreamReader.getLocationInfo();
            // Record layout (5 ints per tag): parent, open-start, open-end, close-start, close-end.
            tagInfo.add(parentTag);
            tagInfo.add((int) info.getStartingCharOffset(), (int) info.getEndingCharOffset());
            tagInfo.add(-1, -1);//these 2 will be populated when we get to the close tag
            thisTag = tagCounter++;

            // From the start of the open tag onward, the new element is the current one.
            parentChangeOffsets.add((int) info.getStartingCharOffset());
            parentChangeIds.add(thisTag);
            break;
          }
          case XMLEvent.END_ELEMENT: {
            final LocationInfo info = xmlStreamReader.getLocationInfo();
            // Fill in the close-tag offsets (slots 3 and 4 of this tag's record).
            tagInfo.set(5 * thisTag + 3, (int) info.getStartingCharOffset());
            tagInfo.set(5 * thisTag + 4, (int) info.getEndingCharOffset());
            thisTag = getParentTag(thisTag);

            // From the end of the close tag onward, the parent is the current element again.
            parentChangeOffsets.add((int) info.getEndingCharOffset());
            parentChangeIds.add(thisTag);
            break;
          }
          default: //do nothing
        }
      }
    } catch (XMLStreamException | RuntimeException | Error e) {
      // Release the reader on failure too, without letting a close problem hide the real cause.
      try {
        xmlStreamReader.close();
      } catch (XMLStreamException closeFailure) {
        e.addSuppressed(closeFailure);
      }
      throw e;
    }
    xmlStreamReader.close();
  }

}