All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.solr.handler.tagger.OffsetCorrector Maven / Gradle / Ivy

There is a newer version: 9.7.0
Show newest version
/*
 * This software was produced for the U. S. Government
 * under Contract No. W15P7T-11-C-F600, and is
 * subject to the Rights in Noncommercial Computer Software
 * and Noncommercial Computer Software Documentation
 * Clause 252.227-7014 (JUN 1995)
 *
 * Copyright 2013 The MITRE Corporation. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.handler.tagger;

import java.util.Arrays;

import com.carrotsearch.hppc.IntArrayList;

public abstract class OffsetCorrector {

  //TODO support a streaming style of consuming input text so that we need not take a
  // String. Trickier because we need to keep more information as we parse to know when tags
  // are adjacent with/without whitespace

  //Data structure requirements:
  // Given a character offset:
  //   * determine what tagId is it's parent.
  //   * determine if it is adjacent to the parent open tag, ignoring whitespace
  //   * determine if it is adjacent to the parent close tag, ignoring whitespace
  // Given a tagId:
  //   * What is it's parent tagId
  //   * What's the char offset of the start and end of the open tag
  //   * What's the char offset of the start and end of the close tag

  /** Document text. */
  protected final String docText;

  /** Array of tag info comprised of 5 int fields:
   *    [int parentTag, int openStartOff, int openEndOff, int closeStartOff, int closeEndOff].
   * It's size indicates how many tags there are. Tag's are ID'ed sequentially from 0. */
  protected final IntArrayList tagInfo;

  /** offsets of parent tag id change (ascending order) */
  protected final IntArrayList parentChangeOffsets;
  /** tag id; parallel array to parentChangeOffsets */
  protected final IntArrayList parentChangeIds;

  protected final int[] offsetPair = new int[] { -1, -1};//non-thread-safe state

  /** Disjoint start and end span offsets (inclusive) of non-taggable sections. Null if none. */
  protected final IntArrayList nonTaggableOffsets;

  /**
   * Initialize based on the document text.
   * @param docText non-null structured content.
   * @param hasNonTaggable if there may be "non-taggable" tags to track
   */
  protected OffsetCorrector(String docText, boolean hasNonTaggable) {
    this.docText = docText;
    final int guessNumElements = Math.max(docText.length() / 20, 4);

    tagInfo = new IntArrayList(guessNumElements * 5);
    parentChangeOffsets = new IntArrayList(guessNumElements * 2);
    parentChangeIds = new IntArrayList(guessNumElements * 2);
    nonTaggableOffsets = hasNonTaggable ? new IntArrayList(guessNumElements / 5) : null;
  }

  /** Corrects the start and end offset pair. It will return null if it can't
   * due to a failure to keep the offsets balance-able, or if it spans "non-taggable" tags.
   * The start (left) offset is pulled left as needed over whitespace and opening tags. The end
   * (right) offset is pulled right as needed over whitespace and closing tags. It's returned as
   * a 2-element array.
   * 

Note that the returned array is internally reused; just use it to examine the response. */ public int[] correctPair(int leftOffset, int rightOffset) { rightOffset = correctEndOffsetForCloseElement(rightOffset); if (spansNonTaggable(leftOffset, rightOffset)) return null; int startTag = lookupTag(leftOffset); //offsetPair[0] = Math.max(offsetPair[0], getOpenStartOff(startTag)); int endTag = lookupTag(rightOffset-1); //offsetPair[1] = Math.min(offsetPair[1], getCloseStartOff(endTag)); // Find the ancestor tag enclosing offsetPair. And bump out left offset along the way. int iTag = startTag; for (; !tagEnclosesOffset(iTag, rightOffset); iTag = getParentTag(iTag)) { //Ensure there is nothing except whitespace thru OpenEndOff int tagOpenEndOff = getOpenEndOff(iTag); if (hasNonWhitespace(tagOpenEndOff, leftOffset)) return null; leftOffset = getOpenStartOff(iTag); } final int ancestorTag = iTag; // Bump out rightOffset until we get to ancestorTag. for (iTag = endTag; iTag != ancestorTag; iTag = getParentTag(iTag)) { //Ensure there is nothing except whitespace thru CloseStartOff int tagCloseStartOff = getCloseStartOff(iTag); if (hasNonWhitespace(rightOffset, tagCloseStartOff)) return null; rightOffset = getCloseEndOff(iTag); } offsetPair[0] = leftOffset; offsetPair[1] = rightOffset; return offsetPair; } /** Correct endOffset for adjacent element at the right side. E.g. offsetPair might point to: *

   *   foo</tag>
   * 
* and this method pulls the end offset left to the '<'. This is necessary for use with * {@link org.apache.lucene.analysis.charfilter.HTMLStripCharFilter}. * * See https://issues.apache.org/jira/browse/LUCENE-5734 */ protected int correctEndOffsetForCloseElement(int endOffset) { if (docText.charAt(endOffset-1) == '>') { final int newEndOffset = docText.lastIndexOf('<', endOffset - 2); if (newEndOffset > offsetPair[0])//just to be sure return newEndOffset; } return endOffset; } protected boolean hasNonWhitespace(int start, int end) { for (int i = start; i < end; i++) { if (!Character.isWhitespace(docText.charAt(i))) return true; } return false; } protected boolean tagEnclosesOffset(int tag, int off) { return off >= getOpenStartOff(tag) && off < getCloseEndOff(tag); } protected int getParentTag(int tag) { return tagInfo.get(tag * 5 + 0); } protected int getOpenStartOff(int tag) { return tagInfo.get(tag * 5 + 1); } protected int getOpenEndOff(int tag) { return tagInfo.get(tag * 5 + 2); } protected int getCloseStartOff(int tag) { return tagInfo.get(tag * 5 + 3); } protected int getCloseEndOff(int tag) { return tagInfo.get(tag * 5 + 4); } protected int lookupTag(int off) { int idx = Arrays.binarySearch(parentChangeOffsets.buffer, 0, parentChangeOffsets.size(), off); if (idx < 0) idx = (-idx - 1) - 1;//round down return parentChangeIds.get(idx); } protected boolean spansNonTaggable(int startOff, int endOff) { if (nonTaggableOffsets == null) return false; int idx = Arrays.binarySearch(nonTaggableOffsets.buffer, 0, nonTaggableOffsets.size(), startOff); //if tag start coincides with first or last char of non-taggable span then result is true. // (probably never happens since those characters are actual element markup) if (idx >= 0) return true; idx = -idx - 1;//modify for where we would insert //if idx is odd then our span intersects a non-taggable span; return true if ((idx & 1) == 1) return true; //it's non-taggable if the next non-taggable start span is before our endOff if (idx == nonTaggableOffsets.size()) return false; return nonTaggableOffsets.get(idx) < endOff; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy