
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.schema;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.document.Field;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeSource.State;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.schema.PreAnalyzedField.ParseResult;
import org.apache.solr.schema.PreAnalyzedField.PreAnalyzedParser;

/**
 * Simple plain text format parser for {@link PreAnalyzedField}.
 *
 * <h2>Serialization format</h2>
 *
 * <p>The format of the serialization is as follows:
 *
 * <pre>
 * content ::= version (stored)? tokens
 * version ::= digit+ " "
 * ; stored field value - any "=" inside must be escaped!
 * stored ::= "=" text "="
 * tokens ::= (token ((" ") + token)*)*
 * token ::= text ("," attrib)*
 * attrib ::= name '=' value
 * name ::= text
 * value ::= text
 * </pre>
 *
 * <p>Special characters in "text" values can be escaped using the escape
 * character \ . The following escape sequences are recognized:
 *
 * <pre>
 * "\ " - literal space character
 * "\," - literal , character
 * "\=" - literal = character
 * "\\" - literal \ character
 * "\n" - newline
 * "\r" - carriage return
 * "\t" - horizontal tab
 * </pre>
 *
 * Please note that Unicode sequences (e.g. \u0001) are not supported.
 *
 * <h2>Supported attribute names</h2>
 *
 * The following token attributes are supported, and identified with short
 * symbolic names:
 *
 * <pre>
 * i - position increment (integer)
 * s - token offset, start position (integer)
 * e - token offset, end position (integer)
 * y - token type (string)
 * f - token flags (hexadecimal integer)
 * p - payload (bytes in hexadecimal format; whitespace is ignored)
 * </pre>
 *
 * Token offsets are tracked and implicitly added to the token stream -
 * the start and end offsets consider only the term text and whitespace,
 * and exclude the space taken by token attributes.
 *
 * <h2>Example token streams</h2>
 *
 * <pre>
 * 1 one two three
 *   - version 1
 *   - stored: 'null'
 *   - tok: '(term=one,startOffset=0,endOffset=3)'
 *   - tok: '(term=two,startOffset=4,endOffset=7)'
 *   - tok: '(term=three,startOffset=8,endOffset=13)'
 * 1 one  two   three
 *   - version 1
 *   - stored: 'null'
 *   - tok: '(term=one,startOffset=0,endOffset=3)'
 *   - tok: '(term=two,startOffset=5,endOffset=8)'
 *   - tok: '(term=three,startOffset=11,endOffset=16)'
 * 1 one,s=123,e=128,i=22  two three,s=20,e=22
 *   - version 1
 *   - stored: 'null'
 *   - tok: '(term=one,positionIncrement=22,startOffset=123,endOffset=128)'
 *   - tok: '(term=two,positionIncrement=1,startOffset=5,endOffset=8)'
 *   - tok: '(term=three,positionIncrement=1,startOffset=20,endOffset=22)'
 * 1 \ one\ \,,i=22,a=\, two\=
 *
 *   \n,\ =\   \
 *   - version 1
 *   - stored: 'null'
 *   - tok: '(term= one ,,positionIncrement=22,startOffset=0,endOffset=6)'
 *   - tok: '(term=two=
 *
 *
 *  ,positionIncrement=1,startOffset=7,endOffset=15)'
 *   - tok: '(term=\,positionIncrement=1,startOffset=17,endOffset=18)'
 * 1 ,i=22 ,i=33,s=2,e=20 ,
 *   - version 1
 *   - stored: 'null'
 *   - tok: '(term=,positionIncrement=22,startOffset=0,endOffset=0)'
 *   - tok: '(term=,positionIncrement=33,startOffset=2,endOffset=20)'
 *   - tok: '(term=,positionIncrement=1,startOffset=2,endOffset=2)'
 * 1 =This is the stored part with \=
 *  \n    \t escapes.=one two three
 *   - version 1
 *   - stored: 'This is the stored part with =
 *  \n    \t escapes.'
 *   - tok: '(term=one,startOffset=0,endOffset=3)'
 *   - tok: '(term=two,startOffset=4,endOffset=7)'
 *   - tok: '(term=three,startOffset=8,endOffset=13)'
 * 1 ==
 *   - version 1
 *   - stored: ''
 *   - (no tokens)
 * 1 =this is a test.=
 *   - version 1
 *   - stored: 'this is a test.'
 *   - (no tokens)
 * </pre>
 */
public final class SimplePreAnalyzedParser implements PreAnalyzedParser {
  static final String VERSION = "1";

  private static class Tok {
    StringBuilder token = new StringBuilder();
    Map<String, String> attr = new HashMap<>();

    public boolean isEmpty() {
      return token.length() == 0 && attr.size() == 0;
    }

    public void reset() {
      token.setLength(0);
      attr.clear();
    }

    @Override
    public String toString() {
      return "tok='" + token + "',attr=" + attr;
    }
  }

  // parser state
  private enum S { TOKEN, NAME, VALUE, UNDEF }

  private static final byte[] EMPTY_BYTES = new byte[0];

  /** Utility method to convert a hex string to a byte array. */
  static byte[] hexToBytes(String hex) {
    if (hex == null) {
      return EMPTY_BYTES;
    }
    hex = hex.replaceAll("\\s+", "");
    if (hex.length() == 0) {
      return EMPTY_BYTES;
    }
    ByteArrayOutputStream baos = new ByteArrayOutputStream(hex.length() / 2);
    byte b;
    for (int i = 0; i < hex.length(); i++) {
      int high = charToNibble(hex.charAt(i));
      int low = 0;
      if (i < hex.length() - 1) {
        i++;
        low = charToNibble(hex.charAt(i));
      }
      b = (byte) (high << 4 | low);
      baos.write(b);
    }
    return baos.toByteArray();
  }

  static final int charToNibble(char c) {
    if (c >= '0' && c <= '9') {
      return c - '0';
    } else if (c >= 'a' && c <= 'f') {
      return 0xa + (c - 'a');
    } else if (c >= 'A' && c <= 'F') {
      return 0xA + (c - 'A');
    } else {
      throw new RuntimeException("Not a hex character: '" + c + "'");
    }
  }

  static String bytesToHex(byte[] bytes, int offset, int length) {
    StringBuilder sb = new StringBuilder();
    for (int i = offset; i < offset + length; ++i) {
      sb.append(Integer.toHexString(0x0100 + (bytes[i] & 0x00FF)).substring(1));
    }
    return sb.toString();
  }

  public SimplePreAnalyzedParser() {}

  @Override
  public ParseResult parse(Reader reader, AttributeSource parent) throws IOException {
    ParseResult res = new ParseResult();
    StringBuilder sb = new StringBuilder();
    char[] buf = new char[128];
    int cnt;
    while ((cnt = reader.read(buf)) > 0) {
      sb.append(buf, 0, cnt);
    }
    String val = sb.toString();
    // empty string - accept even without version number
    if (val.length() == 0) {
      return res;
    }
    // first consume the version
    int idx = val.indexOf(' ');
    if (idx == -1) {
      throw new IOException("Missing VERSION token");
    }
    String version = val.substring(0, idx);
    if (!VERSION.equals(version)) {
      throw new IOException("Unknown VERSION " + version);
    }
    val = val.substring(idx + 1);
    // then consume the optional stored part
    int tsStart = 0;
    boolean hasStored = false;
    StringBuilder storedBuf = new StringBuilder();
    if (val.charAt(0) == '=') {
      hasStored = true;
      if (val.length() > 1) {
        for (int i = 1; i < val.length(); i++) {
          char c = val.charAt(i);
          if (c == '\\') {
            if (i < val.length() - 1) {
              c = val.charAt(++i);
              if (c == '=') { // we recognize only \= escape in the stored part
                storedBuf.append('=');
              } else {
                storedBuf.append('\\');
                storedBuf.append(c);
                continue;
              }
            } else {
              storedBuf.append(c);
              continue;
            }
          } else if (c == '=') {
            // end of stored text
            tsStart = i + 1;
            break;
          } else {
            storedBuf.append(c);
          }
        }
        if (tsStart == 0) { // missing end-of-stored marker
          throw new IOException("Missing end marker of stored part");
        }
      } else {
        throw new IOException("Unexpected end of stored field");
      }
    }
    if (hasStored) {
      res.str = storedBuf.toString();
    }
    Tok tok = new Tok();
    StringBuilder attName = new StringBuilder();
    StringBuilder attVal = new StringBuilder();
    // parser state
    S s = S.UNDEF;
    // running offset: counts only term characters and separator spaces
    int lastPos = 0;
    for (int i = tsStart; i < val.length(); i++) {
      char c = val.charAt(i);
      if (c == ' ') {
        // collect leftovers
        switch (s) {
          case VALUE:
            if (attVal.length() == 0) {
              throw new IOException("Unexpected character '" + c + "' at position " + i
                  + " - empty value of attribute.");
            }
            if (attName.length() > 0) {
              tok.attr.put(attName.toString(), attVal.toString());
            }
            break;
          case NAME: // attr name without a value ?
            if (attName.length() > 0) {
              throw new IOException("Unexpected character '" + c + "' at position " + i
                  + " - missing attribute value.");
            } else {
              // accept missing att name and value
            }
            break;
          case TOKEN:
          case UNDEF:
            // do nothing, advance to next token
        }
        attName.setLength(0);
        attVal.setLength(0);
        if (!tok.isEmpty() || s == S.NAME) {
          AttributeSource.State state = createState(parent, tok, lastPos);
          if (state != null) res.states.add(state.clone());
        }
        // reset tok
        s = S.UNDEF;
        tok.reset();
        // skip
        lastPos++;
        continue;
      }
      StringBuilder tgt = null;
      switch (s) {
        case TOKEN:
          tgt = tok.token;
          break;
        case NAME:
          tgt = attName;
          break;
        case VALUE:
          tgt = attVal;
          break;
        case UNDEF:
          tgt = tok.token;
          s = S.TOKEN;
      }
      if (c == '\\') {
        if (s == S.TOKEN) lastPos++;
        if (i >= val.length() - 1) { // end
          tgt.append(c);
          continue;
        } else {
          c = val.charAt(++i);
          switch (c) {
            case '\\':
            case '=':
            case ',':
            case ' ':
              tgt.append(c);
              break;
            case 'n':
              tgt.append('\n');
              break;
            case 'r':
              tgt.append('\r');
              break;
            case 't':
              tgt.append('\t');
              break;
            default:
              tgt.append('\\');
              tgt.append(c);
              lastPos++;
          }
        }
      } else {
        // state switch
        if (c == ',') {
          if (s == S.TOKEN) {
            s = S.NAME;
          } else if (s == S.VALUE) { // end of value, start of next attr
            if (attVal.length() == 0) {
              throw new IOException("Unexpected character '" + c + "' at position " + i
                  + " - empty value of attribute.");
            }
            if (attName.length() > 0 && attVal.length() > 0) {
              tok.attr.put(attName.toString(), attVal.toString());
            }
            // reset
            attName.setLength(0);
            attVal.setLength(0);
            s = S.NAME;
          } else {
            throw new IOException("Unexpected character '" + c + "' at position " + i
                + " - missing attribute value.");
          }
        } else if (c == '=') {
          if (s == S.NAME) {
            s = S.VALUE;
          } else {
            throw new IOException("Unexpected character '" + c + "' at position " + i
                + " - empty value of attribute.");
          }
        } else {
          tgt.append(c);
          if (s == S.TOKEN) lastPos++;
        }
      }
    }
    // collect leftovers
    if (!tok.isEmpty() || s == S.NAME || s == S.VALUE) {
      // remaining attrib?
      if (s == S.VALUE) {
        if (attName.length() > 0 && attVal.length() > 0) {
          tok.attr.put(attName.toString(), attVal.toString());
        }
      }
      AttributeSource.State state = createState(parent, tok, lastPos);
      if (state != null) res.states.add(state.clone());
    }
    return res;
  }

  private static AttributeSource.State createState(AttributeSource a, Tok state, int tokenEnd) {
    a.clearAttributes();
    CharTermAttribute termAtt = a.addAttribute(CharTermAttribute.class);
    char[] tokChars = state.token.toString().toCharArray();
    termAtt.copyBuffer(tokChars, 0, tokChars.length);
    int tokenStart = tokenEnd - state.token.length();
    for (Entry<String, String> e : state.attr.entrySet()) {
      String k = e.getKey();
      if (k.equals("i")) {
        // position increment
        int incr = Integer.parseInt(e.getValue());
        PositionIncrementAttribute posIncr = a.addAttribute(PositionIncrementAttribute.class);
        posIncr.setPositionIncrement(incr);
      } else if (k.equals("s")) {
        tokenStart = Integer.parseInt(e.getValue());
      } else if (k.equals("e")) {
        tokenEnd = Integer.parseInt(e.getValue());
      } else if (k.equals("y")) {
        TypeAttribute type = a.addAttribute(TypeAttribute.class);
        type.setType(e.getValue());
      } else if (k.equals("f")) {
        FlagsAttribute flags = a.addAttribute(FlagsAttribute.class);
        int f = Integer.parseInt(e.getValue(), 16);
        flags.setFlags(f);
      } else if (k.equals("p")) {
        PayloadAttribute p = a.addAttribute(PayloadAttribute.class);
        byte[] data = hexToBytes(e.getValue());
        if (data != null && data.length > 0) {
          p.setPayload(new BytesRef(data));
        }
      } else {
        // unknown attribute
      }
    }
    // handle offset attr
    OffsetAttribute offset = a.addAttribute(OffsetAttribute.class);
    offset.setOffset(tokenStart, tokenEnd);
    State resState = a.captureState();
    a.clearAttributes();
    return resState;
  }

  @Override
  public String toFormattedString(Field f) throws IOException {
    StringBuilder sb = new StringBuilder();
    sb.append(VERSION + " ");
    if (f.fieldType().stored()) {
      String s = f.stringValue();
      if (s != null) {
        // encode the equals sign
        s = s.replaceAll("=", "\\\\=");
        sb.append('=');
        sb.append(s);
        sb.append('=');
      }
    }
    TokenStream ts = f.tokenStreamValue();
    if (ts != null) {
      StringBuilder tok = new StringBuilder();
      boolean next = false;
      while (ts.incrementToken()) {
        if (next) {
          sb.append(' ');
        } else {
          next = true;
        }
        tok.setLength(0);
        Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
        String cTerm = null;
        String tTerm = null;
        while (it.hasNext()) {
          Class<? extends Attribute> cl = it.next();
          Attribute att = ts.getAttribute(cl);
          if (att == null) {
            continue;
          }
          if (cl.isAssignableFrom(CharTermAttribute.class)) {
            CharTermAttribute catt = (CharTermAttribute) att;
            cTerm = escape(catt.buffer(), catt.length());
          } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
            TermToBytesRefAttribute tatt = (TermToBytesRefAttribute) att;
            char[] tTermChars = tatt.getBytesRef().utf8ToString().toCharArray();
            tTerm = escape(tTermChars, tTermChars.length);
          } else {
            if (tok.length() > 0) tok.append(',');
            if (cl.isAssignableFrom(FlagsAttribute.class)) {
              tok.append("f=").append(Integer.toHexString(((FlagsAttribute) att).getFlags()));
            } else if (cl.isAssignableFrom(OffsetAttribute.class)) {
              tok.append("s=").append(((OffsetAttribute) att).startOffset())
                  .append(",e=").append(((OffsetAttribute) att).endOffset());
            } else if (cl.isAssignableFrom(PayloadAttribute.class)) {
              BytesRef p = ((PayloadAttribute) att).getPayload();
              if (p != null && p.length > 0) {
                tok.append("p=").append(bytesToHex(p.bytes, p.offset, p.length));
              } else if (tok.length() > 0) {
                tok.setLength(tok.length() - 1); // remove the last comma
              }
            } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
              tok.append("i=").append(((PositionIncrementAttribute) att).getPositionIncrement());
            } else if (cl.isAssignableFrom(TypeAttribute.class)) {
              tok.append("y=").append(escape(((TypeAttribute) att).type()));
            } else {
              tok.append(cl.getName()).append('=').append(escape(att.toString()));
            }
          }
        }
        // prefer the char-term form of the term text if both are present
        String term = null;
        if (cTerm != null) {
          term = cTerm;
        } else {
          term = tTerm;
        }
        if (term != null && term.length() > 0) {
          if (tok.length() > 0) {
            tok.insert(0, term + ",");
          } else {
            tok.insert(0, term);
          }
        }
        sb.append(tok);
      }
    }
    return sb.toString();
  }

  String escape(String val) {
    return escape(val.toCharArray(), val.length());
  }

  String escape(char[] val, int len) {
    if (val == null || len == 0) {
      return "";
    }
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < len; i++) {
      switch (val[i]) {
        case '\\':
        case '=':
        case ',':
        case ' ':
          sb.append('\\');
          sb.append(val[i]);
          break;
        case '\n':
          sb.append('\\');
          sb.append('n');
          break;
        case '\r':
          sb.append('\\');
          sb.append('r');
          break;
        case '\t':
          sb.append('\\');
          sb.append('t');
          break;
        default:
          sb.append(val[i]);
      }
    }
    return sb.toString();
  }
}
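
To make the parsing entry point concrete, here is a minimal, hypothetical sketch of driving parse() by hand. The class name ParseDemo and the input string are invented for illustration; the sketch assumes it sits in the org.apache.solr.schema package and uses only the APIs visible above plus standard Lucene AttributeSource methods.

package org.apache.solr.schema;

import java.io.StringReader;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.AttributeSource;

public class ParseDemo {
  public static void main(String[] args) throws Exception {
    SimplePreAnalyzedParser parser = new SimplePreAnalyzedParser();
    AttributeSource source = new AttributeSource();
    // "1" is the format version, "=stored text=" the optional stored part,
    // and the two tokens carry explicit start/end offsets (s/e attributes).
    PreAnalyzedField.ParseResult res = parser.parse(
        new StringReader("1 =stored text=quick,s=0,e=5 fox,s=6,e=9"), source);
    System.out.println("stored: " + res.str);
    for (AttributeSource.State state : res.states) {
      // parse() captured each token's attributes from `source`, so restoring
      // a state back into the same AttributeSource is always safe
      source.restoreState(state);
      CharTermAttribute term = source.getAttribute(CharTermAttribute.class);
      OffsetAttribute off = source.getAttribute(OffsetAttribute.class);
      System.out.println(term + " [" + off.startOffset() + "," + off.endOffset() + "]");
    }
  }
}

Restoring into the same AttributeSource that was passed as parent guarantees every attribute type captured in the states is already registered on the target.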

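Going the other direction, toFormattedString(Field) serializes a stored value plus a token stream into the same format. The following sketch is likewise hypothetical (class name SerializeDemo and the field contents are invented) and assumes a recent Lucene where WhitespaceTokenizer has a no-arg constructor and Field.setTokenStream is available.

package org.apache.solr.schema;

import java.io.StringReader;

import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;

public class SerializeDemo {
  public static void main(String[] args) throws Exception {
    // TextField.TYPE_STORED is stored + tokenized, so both the "=...=" stored
    // part and the token list appear in the output
    Field f = new Field("content", "one two three", TextField.TYPE_STORED);
    WhitespaceTokenizer tok = new WhitespaceTokenizer();
    tok.setReader(new StringReader("one two three"));
    tok.reset(); // toFormattedString() consumes the stream via incrementToken()
    f.setTokenStream(tok);
    // Prints something like: 1 =one two three=one,i=1,s=0,e=3,y=word two,...
    // (the exact attribute list depends on the tokenizer's attributes)
    System.out.println(new SimplePreAnalyzedParser().toFormattedString(f));
  }
}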

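Finally, the payload attribute ("p") travels as hex text, handled by the two package-private helpers above. A tiny round trip (class name HexDemo invented; it must sit in org.apache.solr.schema to see the package-private methods):

package org.apache.solr.schema;

public class HexDemo {
  public static void main(String[] args) {
    // whitespace inside the hex string is ignored, as in the "p" attribute
    byte[] payload = SimplePreAnalyzedParser.hexToBytes("0a 0B ff");
    // bytesToHex always emits two lowercase digits per byte
    String hex = SimplePreAnalyzedParser.bytesToHex(payload, 0, payload.length);
    System.out.println(hex); // prints "0a0bff"
  }
}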

