com.javanut.pronghorn.util.TrieParser Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of pronghorn-pipes Show documentation
Show all versions of pronghorn-pipes Show documentation
Ring buffer based queuing utility for applications that require high performance and/or a small
footprint. Well suited for embedded and stream based processing.
package com.javanut.pronghorn.util;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintStream;
import java.io.Serializable;
import java.util.Arrays;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.javanut.pronghorn.pipe.Pipe;
import com.javanut.pronghorn.pipe.RawDataSchema;
/**
* Optimized for fast lookup and secondarily size.
* Inserts may require data copy and this could be optimized in future releases if needed.
*
* @author Nathan Tippy
*
*/
public class TrieParser implements Serializable {
//TODO: new feature under development, Not working yet with JSON streaming or escape escape.
static final boolean doSupportSwitch = true; //Still working on feature
private static final long serialVersionUID = -2877089562575447986L;
private static final Logger logger = LoggerFactory.getLogger(TrieParser.class);
// Node type codes stored in the data array; each code is followed by the payload noted beside it.
public static final byte TYPE_RUN = 0x00; //followed by length, then that many literal values
public static final byte TYPE_BRANCH_VALUE = 0x01; //followed by mask & short jump
public static final byte TYPE_ALT_BRANCH = 0X02; //followed by 2 short jump, try first; upon failure use second.
public static final byte TYPE_SWITCH_BRANCH = 0X03; //followed by 1 short (Hi: base offset)|(Lo: trie len) followed by pairs of shorts for run
public static final byte TYPE_VALUE_NUMERIC = 0x04; //followed by type, parse right kind of number
public static final byte TYPE_VALUE_BYTES = 0x05; //followed by stop byte, take all until stop byte encountered (AKA Wild Card)
public static final byte TYPE_SAFE_END = 0X06; //followed by SIZE_OF_RESULT result slots; more nodes follow after the result
public static final byte TYPE_END = 0x07; //followed by SIZE_OF_RESULT result slots
static final int BRANCH_JUMP_SIZE = 2;
static final int SIZE_OF_BRANCH = 1+1+BRANCH_JUMP_SIZE; //type, branchon, jumpvalue
static final int SIZE_OF_ALT_BRANCH = 1 +BRANCH_JUMP_SIZE; //type, jumpvalue
static final int SIZE_OF_RUN = 1+1;
final int SIZE_OF_RESULT;
final int SIZE_OF_END_1;
final int SIZE_OF_SAFE_END;
static final int SIZE_OF_VALUE_NUMERIC = 1+1; //second value is type mask
static final int SIZE_OF_VALUE_BYTES = 1+1; //second value is stop marker
final boolean skipDeepChecks;//these runs are not significant and do not provide any consumed data.
//humans require long readable URLs but the machine can split them into categories on just a few key bytes
public final byte ESCAPE_BYTE;
public final byte NO_ESCAPE_SUPPORT=(byte)0xFF;
//EXTRACT VALUE
public static final byte ESCAPE_CMD_OPTIONAL_SIGNED_INT = 'o'; //optional signed int, if absent returns zero
public static final byte ESCAPE_CMD_SIGNED_INT = 'i'; //signedInt (may be hex if starts with 0x)
public static final byte ESCAPE_CMD_UNSIGNED_INT = 'u'; //unsignedInt (may be hex if starts with 0x)
public static final byte ESCAPE_CMD_SIGNED_HEX = 'I'; //signedInt (may skip prefix 0x, assumed to be hex)
public static final byte ESCAPE_CMD_UNSIGNED_HEX = 'U'; //unsignedInt (may skip prefix 0x, assumed to be hex)
public static final byte ESCAPE_CMD_DECIMAL = '.'; //if found capture u and places else captures zero and 1 place
public static final byte ESCAPE_CMD_RATIONAL = '/'; //if found capture i else captures 1
//EXTRACTED BYTES
public static final byte ESCAPE_CMD_BYTES = 'b';
//////////////////////////////////////////////////////////////////////
///Every pattern is unaware of any context and can be mixed in any way.
//////////////////////////////////////////////////////////////////////
// %% a literal %
// %i%. unsigned value after dot in decimal and zero if not found eg 3.75
// %i%/ signed value after dot in hex and 1 if not found eg 3/-4
// %i%.%/%. a rational number made up of two decimals eg 2.3/-1.7
// %bX where X is the excluded stop short
//////////////////////////////////////////////////////////////////////
//TODO: add support for %Ni where we capture a number of fixed length N
// the numbers supported are upper-case 012345789ABCDEFGHIJKLMNOPQRSTUVWXYZ
// this will be for HHmmSS extraction into 3 fields...
//numeric type bits:
// leading sign (only in front)
static final short NUMERIC_FLAG_SIGN = 1;
// hex values can start with 0x, hex is all lower case abcdef
static final short NUMERIC_FLAG_HEX = 2;
// starts with . if not return zero
static final short NUMERIC_FLAG_DECIMAL = 4;
// starts with / if not return 1
static final short NUMERIC_FLAG_RATIONAL = 8;
// when there is no number take that path and use zero as the value
static final short NUMERIC_FLAG_ABSENT_IS_ZERO = (short)0x8000;
private final boolean fixedSize;
short[] data;
int limit = 0;
private final int MAX_TEXT_LENGTH = 1024;
private transient Pipe workingPipe = RawDataSchema.instance.newPipe(2,MAX_TEXT_LENGTH);
private int maxExtractedFields = 0;//out of all the byte patterns known what is the maximum # of extracted fields from any of them.
private final static int MAX_ALT_DEPTH = 128;
private int altStackPos = 0;
private int[] altStackA = new int[MAX_ALT_DEPTH];
private int[] altStackB = new int[MAX_ALT_DEPTH];
int activeExtractionCount;
byte[] extractions = new byte[32];//hard coded limit of extraction points, could be larger but why...
/**
 * Exposes the extraction commands recorded while the most recent pattern was set.
 *
 * @return a new array holding the first {@code activeExtractionCount} entries of {@code extractions}
 */
public byte[] lastSetValueExtractonPattern() {
    final byte[] recorded = extractions;
    final int count = activeExtractionCount;
    return Arrays.copyOfRange(recorded, 0, count);
}
/**
 * @return the number of extraction points recorded for the most recently set pattern
 */
public int lastSetValueExtractionCount() {
    final int count = this.activeExtractionCount;
    return count;
}
/**
 * @return the largest number of extracted fields seen across all patterns set so far
 */
public int maxExtractedFields() {
    final int max = this.maxExtractedFields;
    return max;
}
/**
 * Static accessor for the maximum extracted field count of the given parser.
 *
 * @param that parser to inspect
 * @return {@code that.maxExtractedFields}
 */
public static int maxExtractedFields(TrieParser that) {
    final int max = that.maxExtractedFields;
    return max;
}
//used for detection of parse errors, eg do we need more data or did something bad happen.
private int maxBytesCapturable = 500; //largest text
private int maxNumericLenCapturable = 20; //largest numeric.
/**
 * Creates a parser with an initial data array of 256 slots and the defaults of the
 * single-argument constructor.
 */
public TrieParser() {
this(256);
}
/**
 * Creates a parser with a result size of 1, deep checks enabled and extraction supported.
 *
 * @param size initial length of the internal data array
 */
public TrieParser(int size) {
this(size, 1, false, true);
}
/**
 * Creates a parser with a result size of 1 and extraction supported.
 *
 * @param size initial length of the internal data array
 * @param skipDeepChecks stored as-is in the {@code skipDeepChecks} field
 */
public TrieParser(int size, boolean skipDeepChecks) {
this(size, 1, skipDeepChecks, true);
}
/**
 * Creates a case-sensitive parser.
 *
 * @param size initial length of the internal data array
 * @param resultSize number of data slots used for each stored result
 * @param skipDeepChecks stored as-is in the {@code skipDeepChecks} field
 * @param supportsExtraction when false the escape byte is disabled
 */
public TrieParser(int size, int resultSize, boolean skipDeepChecks, boolean supportsExtraction) {
this(size,resultSize,skipDeepChecks,supportsExtraction,false);
}
/**
 * Creates a parser using '%' as the escape byte.
 *
 * @param size initial length of the internal data array
 * @param resultSize number of data slots used for each stored result
 * @param skipDeepChecks stored as-is in the {@code skipDeepChecks} field
 * @param supportsExtraction when false the escape byte is disabled
 * @param ignoreCase when true the case rule mask is set to 0xDF instead of 0xFF
 */
public TrieParser(int size, int resultSize, boolean skipDeepChecks, boolean supportsExtraction, boolean ignoreCase) {
this(size,resultSize,skipDeepChecks,supportsExtraction,ignoreCase,(byte)'%'); //default escape is set here.
}
public TrieParser(int size, int resultSize, boolean skipDeepChecks, boolean supportsExtraction, boolean ignoreCase, byte customEscape) {
this.data = new short[size];
this.fixedSize = false; //if its not fixed size then the .data array will grow as needed.
this.workingPipe.initBuffers();
this.SIZE_OF_RESULT = resultSize; //custom result size for this instance
this.SIZE_OF_END_1 = 1+SIZE_OF_RESULT;
this.SIZE_OF_SAFE_END = 1+SIZE_OF_RESULT;//Same as end except we keep going and store this
this.skipDeepChecks = skipDeepChecks;
if (supportsExtraction) {
assert(customEscape!=NO_ESCAPE_SUPPORT);
ESCAPE_BYTE = customEscape; //set custom escape char for the case that we need to use %
} else {
ESCAPE_BYTE = NO_ESCAPE_SUPPORT;
}
this.caseRuleMask = ignoreCase ? (byte)0xDF : (byte)0xFF;
}
/**
 * Static accessor for the {@code limit} field of the given parser.
 *
 * @param that parser to inspect
 * @return {@code that.limit}
 */
public static int getLimit(TrieParser that) {
    final int used = that.limit;
    return used;
}
/**
 * @return the current value of the {@code limit} field
 */
public int getLimit() {
    final int used = this.limit;
    return used;
}
/**
 * @return the {@code skipDeepChecks} flag given at construction
 */
public boolean isSkipDeepChecks() {
    final boolean skip = this.skipDeepChecks;
    return skip;
}
/**
 * Associates a value with the pattern held in a (possibly wrapping) byte array.
 * Delegates to the internal insert starting at data position 0.
 *
 * @param source backing array holding the pattern bytes
 * @param offset position of the first pattern byte
 * @param length number of pattern bytes
 * @param mask index mask applied to positions in {@code source}
 * @param value value to store for the pattern
 */
public void setValue(byte[] source, int offset, int length, int mask, long value) {
    final int rootPosition = 0;
    setValue(rootPosition, source, offset, length, mask, value);
}
/**
 * Renders this parser as text by delegating to {@link #toString(StringBuilder)}.
 *
 * @return the textual dump produced into a fresh {@code StringBuilder}
 */
@Override
public String toString() {
    return toString(new StringBuilder()).toString();
}
public StringBuilder toString(StringBuilder builder) {
int i = 0;
while (i>8)&0xFF;
int trieLen = meta&0xFF;
builder.append(base).append("+").append(trieLen).append("[").append(i++).append("], \n"); //meta shift.count
int b = i;
//jumps
for(int k=0; k>1;
int value = (0xFF)&steps+base;
if (data[i]>=0) {
if (value<127 && value>=32) {
builder.append("case: '").append((char)value).append("' ");
} else {
builder.append("case: ").append(value).append(" ");
}
int j = data[i]<<15;
builder.append(data[i]).append("[").append(i++).append("], ");
j |= (data[i]&0x7FFF);
builder.append(data[i]).append("[").append(i++).append("], ");//JUMP
j += (metaBase+(trieLen<<1));
if (j>=data.length) {
builder.append("ERROR: OUT OF RANGE ");
}
builder.append(" jumpTo:").append(j).append("\n");
} else {
i+=2;
}
}
return i;
}
/**
 * Appends a SAFE node: the type slot followed by SIZE_OF_RESULT result slots,
 * each printed as value[index].
 *
 * @param builder target for the text
 * @param i index of the node's type slot
 * @return index of the first slot after this node
 */
private int toStringSafe(StringBuilder builder, int i) {
    int idx = i;
    builder.append("SAFE");
    builder.append(data[idx]).append("[").append(idx).append("], ");
    idx++;
    for (int remaining = SIZE_OF_RESULT; remaining > 0; remaining--) {
        builder.append(data[idx]).append("[").append(idx).append("], ");
        idx++;
    }
    builder.append("\n");
    return idx;
}
/**
 * Appends an EXTRACT_NUMBER node: the type slot and the slot that follows it,
 * each printed as value[index].
 *
 * @param builder target for the text
 * @param i index of the node's type slot
 * @return index of the first slot after this node
 */
private int toStringNumeric(StringBuilder builder, int i) {
    final int typeIdx = i;
    final int flagsIdx = i + 1;
    builder.append("EXTRACT_NUMBER");
    builder.append(data[typeIdx]).append("[").append(typeIdx).append("], ");
    builder.append(data[flagsIdx]).append("[").append(flagsIdx).append("], \n");
    return flagsIdx + 1;
}
/**
 * Appends an EXTRACT_BYTES node: the type slot and the slot that follows it,
 * each printed as value[index].
 *
 * @param builder target for the text
 * @param i index of the node's type slot
 * @return index of the first slot after this node
 */
private int toStringBytes(StringBuilder builder, int i) {
    final int typeIdx = i;
    final int stopIdx = i + 1;
    builder.append("EXTRACT_BYTES");
    builder.append(data[typeIdx]).append("[").append(typeIdx).append("], ");
    builder.append(data[stopIdx]).append("[").append(stopIdx).append("], \n");
    return stopIdx + 1;
}
/**
 * Appends an END node: the type slot followed by SIZE_OF_RESULT result slots,
 * each printed as value[index].
 *
 * @param builder target for the text
 * @param i index of the node's type slot
 * @return index of the first slot after this node
 */
private int toStringEnd(StringBuilder builder, int i) {
    int idx = i;
    builder.append("END");
    builder.append(data[idx]).append("[").append(idx).append("], ");
    idx++;
    for (int remaining = SIZE_OF_RESULT; remaining > 0; remaining--) {
        builder.append(data[idx]).append("[").append(idx).append("], ");
        idx++;
    }
    builder.append("\n");
    return idx;
}
/**
 * Appends a RUN node: type slot, length slot, then each run value printed as
 * value[index], with printable ASCII values (32..126) also shown quoted.
 *
 * @param builder target for the text
 * @param i index of the node's type slot
 * @return index of the first slot after this node
 */
private int toStringRun(StringBuilder builder, int i) {
    int idx = i;
    builder.append("RUN");
    builder.append(data[idx]).append("[").append(idx).append("], ");
    idx++;

    final int runLength = data[idx];
    builder.append(data[idx]).append("[").append(idx).append("], ");
    idx++;

    for (int remaining = runLength; remaining > 0; remaining--) {
        builder.append(data[idx]);
        final boolean printable = (data[idx]>=32) && (data[idx]<=126);
        if (printable) {
            builder.append("'").append((char)data[idx]).append("'");
        }
        builder.append("[").append(idx).append("], ");
        idx++;
    }
    builder.append("\n");
    return idx;
}
/**
 * Appends an ALT_BRANCH node: type slot plus the two jump slots, followed by the
 * computed jump target (position after the node plus the 15-bit-per-slot jump).
 *
 * @param builder target for the text
 * @param i index of the node's type slot
 * @return index of the first slot after this node
 */
private int toStringAltBranch(StringBuilder builder, int i) {
    final int typeIdx = i;
    final int highIdx = i + 1;
    final int lowIdx = i + 2;
    final int afterNode = i + 3;

    builder.append("ALT_BRANCH");
    builder.append(data[typeIdx]).append("[").append(typeIdx).append("], "); //TYPE

    final int highPart = data[highIdx]<<15;
    builder.append(data[highIdx]).append("[").append(highIdx).append("], ");

    final int jump = highPart | (data[lowIdx]&0x7FFF);
    builder.append(data[lowIdx]).append("[").append(lowIdx).append("], jumpTo:"+(afterNode+jump));//JUMP
    builder.append("\n");
    return afterNode;
}
/**
 * Appends a BRANCH_VALUE node: type slot, the mask slot shown as its low 8 binary
 * digits, the two jump slots, and the computed jump target.
 *
 * @param builder target for the text
 * @param i index of the node's type slot
 * @return index of the first slot after this node
 */
private int toStringBranchValue(StringBuilder builder, int i) {
    final int typeIdx = i;
    final int maskIdx = i + 1;
    final int highIdx = i + 2;
    final int lowIdx = i + 3;
    final int afterNode = i + 4;

    builder.append("BRANCH_VALUE");
    builder.append(data[typeIdx]).append("[").append(typeIdx).append("], "); //TYPE

    //left pad so there are always at least 8 digits to take from the tail
    final String padded = "00000000"+Integer.toBinaryString(data[maskIdx]);
    builder.append(padded.substring(padded.length()-8));
    builder.append("[").append(maskIdx).append("], "); //MASK FOR CHAR

    final int highPart = data[highIdx]<<15;
    builder.append(data[highIdx]).append("[").append(highIdx).append("], ");

    final int jump = highPart | (data[lowIdx]&0x7FFF);
    builder.append(data[lowIdx]).append("[").append(lowIdx).append("], jumpTo:"+(afterNode+jump));//JUMP
    builder.append("\n");
    return afterNode;
}
/**
 * Walks the stored patterns from data position 0, reporting to the visitor.
 *
 * @param pv visitor that receives the patterns
 */
public void visitPatterns(TrieParserVisitor pv) {
    //scratch space for the pattern being assembled; sized to the data array length.
    final byte[] scratch = new byte[data.length];
    final int rootPosition = 0;
    final int scratchStart = 0;
    visitPatterns(pv, rootPosition, scratch, scratchStart);
}
private void visitPatterns(TrieParserVisitor pv, int i, byte[] buffer, int bufferPosition) {
if (i>8)&0xFF;
int trieLen = meta&0xFF;
int base = i+1;
for(int k = 0; k=0) {
visitPatterns(pv,(i+(trieLen<<1)) + jump,buffer,bufferPosition);
}
}
}
/**
 * Visits a SAFE_END node: reports the pattern accumulated so far together with the
 * stored result, then continues with the nodes that follow.
 *
 * The result is rebuilt from SIZE_OF_RESULT slots of the short[] data array, so each
 * slot contributes 16 unsigned bits. This matches the END decoding used elsewhere in
 * this class; shifting by 8 without masking would overlap slots and sign-extend.
 *
 * @param pv visitor that receives the pattern and result
 * @param i index of the node's type slot
 * @param buffer pattern bytes accumulated so far
 * @param bufferPosition number of valid bytes in {@code buffer}
 */
private void visitSafeEnd(TrieParserVisitor pv, int i, byte[] buffer, int bufferPosition) {
    assert(TYPE_SAFE_END == data[i]);
    i++;//skip over the ID;
    int s = SIZE_OF_RESULT;
    long result = 0;
    while (--s >= 0) {
        result = (result<<16) | (0xFFFF&(long)data[i++]);
    }
    pv.visit(buffer, bufferPosition, result);
    //a safe end is not terminal, keep walking what follows it
    visitPatterns(pv,i,buffer,bufferPosition);
}
/**
 * Visits both sides of an ALT_BRANCH node: first the path directly after the node,
 * then the path at the encoded jump destination.
 *
 * @param pv visitor that receives the patterns
 * @param i index of the node's type slot
 * @param buffer pattern bytes accumulated so far
 * @param bufferPosition number of valid bytes in {@code buffer}
 */
private void visitAltBranch(TrieParserVisitor pv, int i, byte[] buffer, int bufferPosition) {
    assert(TYPE_ALT_BRANCH == data[i]);
    //node layout: type, jump high, jump low
    final int localPath = i + 3;
    final int jump = (((int)data[localPath-2])<<15) | (0x7FFF&data[localPath-1]);
    final int jumpPath = localPath + jump;
    visitPatterns(pv, localPath, buffer, bufferPosition);
    visitPatterns(pv, jumpPath, buffer, bufferPosition);
}
/**
 * Visits both sides of a BRANCH_VALUE node: first the path directly after the node,
 * then the path at the encoded jump destination. The mask slot is skipped, it is not
 * needed to enumerate the patterns.
 *
 * @param pv visitor that receives the patterns
 * @param i index of the node's type slot
 * @param buffer pattern bytes accumulated so far
 * @param bufferPosition number of valid bytes in {@code buffer}
 */
private void visitNomBranch(TrieParserVisitor pv, int i, byte[] buffer, int bufferPosition) {
    assert(TYPE_BRANCH_VALUE == data[i]);
    //node layout: type, mask, jump high, jump low
    final int localPath = i + 4;
    final int jump = (((int)data[localPath-2])<<15) | (0x7FFF&data[localPath-1]);
    final int jumpPath = localPath + jump;
    visitPatterns(pv, localPath, buffer, bufferPosition);
    visitPatterns(pv, jumpPath, buffer, bufferPosition);
}
/**
 * Records a numeric extraction in the pattern buffer as two bytes (the node type,
 * then the slot following it narrowed to a byte) and continues after the node.
 *
 * @param pv visitor that receives the patterns
 * @param i index of the node's type slot
 * @param buffer pattern bytes accumulated so far
 * @param bufferPosition number of valid bytes in {@code buffer}
 */
private void visitNumeric(TrieParserVisitor pv, int i, byte[] buffer, int bufferPosition) {
    assert(TYPE_VALUE_NUMERIC == data[i]);
    final int kindIdx = i + 1; //slot after the ID
    buffer[bufferPosition] = TYPE_VALUE_NUMERIC;
    buffer[bufferPosition + 1] = (byte)data[kindIdx]; //type
    visitPatterns(pv, kindIdx + 1, buffer, bufferPosition + 2);
}
/**
 * Records a bytes extraction in the pattern buffer as two bytes (the node type,
 * then the stop value narrowed to a byte) and continues after the node.
 *
 * @param pv visitor that receives the patterns
 * @param i index of the node's type slot
 * @param buffer pattern bytes accumulated so far
 * @param bufferPosition number of valid bytes in {@code buffer}
 */
private void visitBytes(TrieParserVisitor pv, int i, byte[] buffer, int bufferPosition) {
    assert(TYPE_VALUE_BYTES == data[i]);
    final int stopIdx = i + 1; //slot after the ID
    buffer[bufferPosition] = TYPE_VALUE_BYTES;
    buffer[bufferPosition + 1] = (byte)data[stopIdx]; //stopper
    visitPatterns(pv, stopIdx + 1, buffer, bufferPosition + 2);
}
private void visitRun(TrieParserVisitor pv, int i, byte[] buffer, int bufferPosition) {
assert(TYPE_RUN == data[i]);
i++;//skip over the ID;
int runLen = data[i++];
for(int j=0;j= 0) {
result = (result<<16) | (0xFFFF&(long)data[i++]);
}
pv.visit(buffer, bufferPosition, result);
}
public A toDOT(A builder) {
try{
//builder.append("# dot -Tsvg -otemp.svg temp.dot\n");
builder.append("digraph {\n");
int i = 0;
while (i>8)&0xFF;
int trieLen = meta&0xFF;
Appendables.appendValue(builder, offset).append(".");
Appendables.appendValue(builder, trieLen).append("[");
Appendables.appendValue(builder, i++).append("], "); //meta shift.count
//jumps
int c = trieLen;
while (--c>=0) {
Appendables.appendValue(builder, data[i]).append("[");
Appendables.appendValue(builder, i++).append("], ");
Appendables.appendValue(builder, data[i]).append("[");
Appendables.appendValue(builder, i++).append("], ");//JUMP
}
//end of label
builder.append("\"]\n");
//add jumps
int base = start+1+(trieLen<<1);
for(int j = 0; j");
Appendables.appendValue(builder,"node", destination, "\n"); //jump
}
return i;
}
/**
 * Writes the DOT label text for a SAFE node (its result slots as value[index]),
 * closes the label, and emits an edge from this node to the node that follows.
 *
 * @param builder target for the DOT text
 * @param i index of the node's type slot
 * @return index of the first slot after this node
 * @throws IOException if the target rejects an append
 */
private int toDotSafe(Appendable builder, int i) throws IOException {
    final int nodeId = i;
    builder.append("SAFE");
    int idx = nodeId + 1; //type slot is not printed
    for (int remaining = SIZE_OF_RESULT; remaining > 0; remaining--) {
        Appendables.appendValue(builder, data[idx]);
        builder.append("[");
        Appendables.appendValue(builder,idx);
        idx++;
        builder.append("], ");
    }
    //end of label
    builder.append("\"]\n");
    Appendables.appendValue(builder,"node", nodeId);
    builder.append("->");
    Appendables.appendValue(builder,"node", idx, "\n"); //local
    return idx;
}
/**
 * Writes the DOT label text for an EXTRACT_NUMBER node (type slot and the following
 * slot as value[index]), closes the label, and emits an edge to the next node.
 *
 * @param builder target for the DOT text
 * @param i index of the node's type slot
 * @return index of the first slot after this node
 * @throws IOException if the target rejects an append
 */
private int toDotNumeric(Appendable builder, int i) throws IOException {
    final int nodeId = i;
    final int kindIdx = nodeId + 1;
    final int next = nodeId + 2;

    builder.append("EXTRACT_NUMBER");
    Appendables.appendValue(builder, data[nodeId]);
    builder.append("[");
    Appendables.appendValue(builder, nodeId);
    builder.append("], ");

    Appendables.appendValue(builder, data[kindIdx]);
    builder.append("[");
    Appendables.appendValue(builder, kindIdx);
    builder.append("]");

    //end of label
    builder.append("\"]\n");
    Appendables.appendValue(builder,"node", nodeId);
    builder.append("->");
    Appendables.appendValue(builder,"node", next, "\n"); //local
    return next;
}
/**
 * Writes the DOT label text for an EXTRACT_BYTES node (type slot and the following
 * slot as value[index]), closes the label, and emits an edge to the next node.
 *
 * @param builder target for the DOT text
 * @param i index of the node's type slot
 * @return index of the first slot after this node
 * @throws IOException if the target rejects an append
 */
private int toDotBytes(Appendable builder, int i) throws IOException {
    final int nodeId = i;
    final int stopIdx = nodeId + 1;
    final int next = nodeId + 2;

    builder.append("EXTRACT_BYTES");
    Appendables.appendValue(builder,data[nodeId]).append("[");
    Appendables.appendValue(builder,nodeId).append("], ");
    Appendables.appendValue(builder,data[stopIdx]).append("[");
    Appendables.appendValue(builder,stopIdx).append("]");

    //end of label
    builder.append("\"]\n");
    Appendables.appendValue(builder,"node", nodeId);
    builder.append("->");
    Appendables.appendValue(builder,"node", next, "\n"); //local
    return next;
}
/**
 * Writes the DOT label text for an END node (its result slots as value[index]) and
 * closes the label. No outgoing edge is written.
 *
 * @param builder target for the DOT text
 * @param i index of the node's type slot
 * @return index of the first slot after this node
 * @throws IOException if the target rejects an append
 */
private int toDotEnd(Appendable builder, int i) throws IOException {
    builder.append("END");
    int idx = i + 1; //type slot is not printed
    for (int remaining = SIZE_OF_RESULT; remaining > 0; remaining--) {
        Appendables.appendValue(builder,data[idx]).append("[");
        Appendables.appendValue(builder,idx).append("]");
        idx++;
    }
    //end of label
    builder.append("\"]\n");
    return idx;
}
/**
 * Writes the DOT label text for a RUN node: the run length, then each run value as a
 * character when printable ASCII (quote and backslash escaped) or as {value}
 * otherwise. Closes the label and emits an edge to the next node.
 *
 * @param builder target for the DOT text
 * @param i index of the node's type slot
 * @return index of the first slot after this node
 * @throws IOException if the target rejects an append
 */
private int toDotRun(Appendable builder, int i) throws IOException {
    final int nodeId = i;
    final int lengthIdx = nodeId + 1; //type slot is not printed
    final int runLength = data[lengthIdx];
    Appendables.appendValue(builder,"RUN of ", runLength, "\n");

    int idx = lengthIdx + 1;
    for (int remaining = runLength; remaining > 0; remaining--) {
        final boolean printable = (data[idx]>=32) && (data[idx]<=126);
        if (!printable) {
            builder.append("{");
            Appendables.appendValue(builder,data[idx]).append("}");
        } else {
            final char symbol = (char)data[idx];
            //these two would otherwise break the quoted DOT label
            if (symbol=='"' || symbol=='\\') {
                builder.append('\\');
            }
            builder.append(symbol);
        }
        idx++;
    }
    //end of label
    builder.append("\"]\n");
    Appendables.appendValue(builder,"node", nodeId);
    builder.append("->");
    Appendables.appendValue(builder,"node", idx, "\n"); //local
    return idx;
}
/**
 * Writes the DOT label text for an ALT_BRANCH node (type and both jump slots as
 * value[index]), closes the label, and emits two edges: one to the node directly
 * after this one and one to the encoded jump destination.
 *
 * @param builder target for the DOT text
 * @param i index of the node's type slot
 * @return index of the first slot after this node
 * @throws IOException if the target rejects an append
 */
private int toDotAltBranch(Appendable builder, int i) throws IOException {
    final int nodeId = i;
    final int highIdx = nodeId + 1;
    final int lowIdx = nodeId + 2;
    final int next = nodeId + 3;

    builder.append("ALT_BRANCH");
    Appendables.appendValue(builder,data[nodeId]);
    builder.append("[");
    Appendables.appendValue(builder,nodeId).append("], "); //TYPE
    Appendables.appendValue(builder,data[highIdx]).append("[");
    Appendables.appendValue(builder,highIdx).append("], ");//JUMP
    Appendables.appendValue(builder,data[lowIdx]).append("[");
    Appendables.appendValue(builder,lowIdx).append("]"); //JUMP

    //end of label
    builder.append("\"]\n");

    //edge to the local path
    Appendables.appendValue(builder,"node", nodeId);
    builder.append("->");
    Appendables.appendValue(builder,"node", next, "\n"); //local

    //edge to the jump path
    Appendables.appendValue(builder,"node", nodeId);
    final int jump = (((int)data[next-2])<<15) | (0x7FFF&data[next-1]);
    final int destination = next + jump;
    builder.append("->");
    Appendables.appendValue(builder,"node", destination, "\n"); //jump
    return next;
}
/**
 * Writes the DOT label text for a BRANCH_VALUE node (the mask slot as its low 8
 * binary digits), closes the label, and emits two edges: one to the node directly
 * after this one and one to the encoded jump destination.
 *
 * @param builder target for the DOT text
 * @param i index of the node's type slot
 * @return index of the first slot after this node
 * @throws IOException if the target rejects an append
 */
private int toDotBranchValue(Appendable builder, int i) throws IOException {
    final int nodeId = i;
    final int maskIdx = nodeId + 1; //type slot is not printed
    final int next = nodeId + 4;    //type, mask, jump high, jump low

    builder.append("BRANCH ON BIT\n");
    builder.append(" bit:");
    //left pad so there are always at least 8 digits to take from the tail
    final String padded = "00000000"+Integer.toBinaryString(data[maskIdx]);
    builder.append(padded.substring(padded.length()-8));

    //end of label
    builder.append("\"]\n");

    //edge to the local path
    Appendables.appendValue(builder,"node", nodeId);
    builder.append("->");
    Appendables.appendValue(builder,"node", next, "\n"); //local

    //edge to the jump path
    Appendables.appendValue(builder,"node", nodeId);
    final int jump = (((int)data[next-2])<<15) | (0x7FFF&data[next-1]);
    final int destination = next + jump;
    builder.append("->");
    Appendables.appendValue(builder,"node", destination, "\n"); //jump
    return next;
}
/**
 * Builds the mask that is ANDed with a branch jump distance.
 *
 * The low 8 bits of {@code critera} select bits of {@code source}. When none of the
 * selected bits are set the subtraction underflows to -1, so after the unsigned shift
 * and inversion the low 24 bits are clear; otherwise the inversion yields all ones.
 * The result is finally XORed with {@code critera>>>8}.
 *
 * @param source value being tested
 * @param critera branch criteria slot
 * @return the computed mask
 */
static int computeJumpMask(short source, short critera) {
    final int selectedBits = source & (critera & 0xFF);
    final int underflowFill = (selectedBits - 1) >>> 8;
    final int criteriaHigh = critera >>> 8;
    return (~underflowFill) ^ criteriaHigh;
}
/**
 * Encodes the text as UTF-8 through the working pipe and stores {@code value} for it.
 *
 * When the working pipe is too small it is replaced. The replacement is sized with the
 * same byte budget the guard compares against ({@code cs.length()<<3}); sizing it with
 * the bare character count would leave the guard true for the next call of equal
 * length. NOTE(review): assumes the second argument of newPipe is the maximum variable
 * field length in bytes - confirm against the Pipe API.
 *
 * @param cs pattern text
 * @param value value to store for the pattern
 * @return number of bytes written for the encoded pattern
 */
public int setUTF8Value(CharSequence cs, long value) {
    final int requiredVarLen = cs.length()<<3;
    if (requiredVarLen > workingPipe.maxVarLen) {
        workingPipe = RawDataSchema.instance.newPipe(2,requiredVarLen);
        workingPipe.initBuffers();
    }
    Pipe.addMsgIdx(workingPipe, RawDataSchema.MSG_CHUNKEDSTREAM_1);
    int origPos = Pipe.getWorkingBlobHeadPosition(workingPipe);
    int len = Pipe.copyUTF8ToByte(cs, 0, cs.length(), workingPipe);
    Pipe.addBytePosAndLen(workingPipe, origPos, len);
    Pipe.publishWrites(workingPipe);
    Pipe.confirmLowLevelWrite(workingPipe, Pipe.sizeOf(workingPipe, RawDataSchema.MSG_CHUNKEDSTREAM_1));
    //read the message straight back so the bytes can be inserted into the trie
    Pipe.takeMsgIdx(workingPipe);
    setValue(workingPipe, value);
    Pipe.confirmLowLevelRead(workingPipe, Pipe.sizeOf(workingPipe, RawDataSchema.MSG_CHUNKEDSTREAM_1));
    //WARNING: this is not thread safe if set is called and we have not yet parsed!!
    Pipe.releaseReadLock(workingPipe);
    return len;
}
/**
 * Encodes {@code cs} followed by {@code suffix} as UTF-8 through the working pipe and
 * stores {@code value} for the combined pattern.
 *
 * When the working pipe is too small it is replaced. The replacement is sized from the
 * combined length of both parts, using the same byte budget the guard compares against;
 * sizing it from the suffix alone could produce a pipe smaller than the pattern.
 * NOTE(review): assumes the second argument of newPipe is the maximum variable field
 * length in bytes - confirm against the Pipe API.
 *
 * @param cs leading pattern text
 * @param suffix trailing pattern text
 * @param value value to store for the pattern
 * @return number of bytes written for the encoded pattern
 */
public int setUTF8Value(CharSequence cs, CharSequence suffix, long value) {
    final int requiredVarLen = (cs.length()+suffix.length())<<3;
    if (requiredVarLen > workingPipe.maxVarLen) {
        workingPipe = RawDataSchema.instance.newPipe(2,requiredVarLen);
        workingPipe.initBuffers();
    }
    //same message id that is used for the sizeOf calls below
    Pipe.addMsgIdx(workingPipe, RawDataSchema.MSG_CHUNKEDSTREAM_1);
    int origPos = Pipe.getWorkingBlobHeadPosition(workingPipe);
    int len = 0;
    len += Pipe.copyUTF8ToByte(cs, 0, cs.length(), workingPipe);
    len += Pipe.copyUTF8ToByte(suffix, 0, suffix.length(), workingPipe);
    Pipe.addBytePosAndLen(workingPipe, origPos, len);
    Pipe.publishWrites(workingPipe);
    Pipe.confirmLowLevelWrite(workingPipe, Pipe.sizeOf(workingPipe, RawDataSchema.MSG_CHUNKEDSTREAM_1));
    //read the message straight back so the bytes can be inserted into the trie
    Pipe.takeMsgIdx(workingPipe);
    setValue(workingPipe, value);
    Pipe.confirmLowLevelRead(workingPipe, Pipe.sizeOf(workingPipe, RawDataSchema.MSG_CHUNKEDSTREAM_1));
    //WARNING: this is not thread safe if set is called and we have not yet parsed!!
    Pipe.releaseReadLock(workingPipe);
    return len;
}
/**
 * Encodes {@code prefix}, {@code cs} and {@code suffix} in that order as UTF-8 through
 * the working pipe and stores {@code value} for the combined pattern.
 *
 * When the working pipe is too small it is replaced. The replacement is sized from the
 * combined length of all three parts, using the same byte budget the guard compares
 * against; sizing it from the suffix alone could produce a pipe smaller than the pattern.
 * NOTE(review): assumes the second argument of newPipe is the maximum variable field
 * length in bytes - confirm against the Pipe API.
 *
 * @param prefix leading pattern text
 * @param cs middle pattern text
 * @param suffix trailing pattern text
 * @param value value to store for the pattern
 * @return number of bytes written for the encoded pattern
 */
public int setUTF8Value(CharSequence prefix, CharSequence cs, CharSequence suffix, long value) {
    final int requiredVarLen = (prefix.length() + cs.length() + suffix.length()) << 3;
    if (requiredVarLen > workingPipe.maxVarLen) {
        workingPipe = RawDataSchema.instance.newPipe(2,requiredVarLen);
        workingPipe.initBuffers();
    }
    //same message id that is used for the sizeOf calls below
    Pipe.addMsgIdx(workingPipe, RawDataSchema.MSG_CHUNKEDSTREAM_1);
    int origPos = Pipe.getWorkingBlobHeadPosition(workingPipe);
    int len = 0;
    len += Pipe.copyUTF8ToByte(prefix, 0, prefix.length(), workingPipe);
    len += Pipe.copyUTF8ToByte(cs, 0, cs.length(), workingPipe);
    len += Pipe.copyUTF8ToByte(suffix, 0, suffix.length(), workingPipe);
    Pipe.addBytePosAndLen(workingPipe, origPos, len);
    Pipe.publishWrites(workingPipe);
    Pipe.confirmLowLevelWrite(workingPipe, Pipe.sizeOf(workingPipe, RawDataSchema.MSG_CHUNKEDSTREAM_1));
    //read the message straight back so the bytes can be inserted into the trie
    Pipe.takeMsgIdx(workingPipe);
    setValue(workingPipe, value);
    Pipe.confirmLowLevelRead(workingPipe, Pipe.sizeOf(workingPipe, RawDataSchema.MSG_CHUNKEDSTREAM_1));
    //WARNING: this is not thread safe if set is called and we have not yet parsed!!
    Pipe.releaseReadLock(workingPipe);
    return len;
}
public void setValue(Pipe p, long value) {
setValue(p, Pipe.takeByteArrayMetaData((Pipe>) p), Pipe.takeByteArrayLength((Pipe>) p), value);
}
/**
 * Resolves a pipe byte-array field (meta plus length) to its backing array, start
 * position and index mask, then inserts the pattern starting at data position 0.
 *
 * @param p pipe that owns the field
 * @param meta meta data of the byte-array field
 * @param length length of the byte-array field
 * @param value value to store for the pattern
 */
private void setValue(Pipe p, int meta, int length, long value) {
    //resolved in the same order as the original argument list
    final byte[] backing = Pipe.byteBackingArray(meta, p);
    final int startPosition = Pipe.bytePosition(meta, p, length);
    final int indexMask = Pipe.blobMask(p);
    setValue(0, backing, startPosition, length, indexMask, value);
}
//since this is an alt one of the 3 alt values must exist
//these are set up so that we prefer the type with the lowest (to the left) index over later ones
private static final int[] captureBytesChoices =
new int[]{TrieParser.TYPE_VALUE_BYTES,TrieParser.TYPE_VALUE_NUMERIC,TrieParser.TYPE_ALT_BRANCH};
private static final int[] captureNumberChoices =
new int[]{TrieParser.TYPE_VALUE_NUMERIC,TrieParser.TYPE_VALUE_BYTES,TrieParser.TYPE_ALT_BRANCH};
private static final int[] definedChoices =
new int[]{TrieParser.TYPE_RUN, TrieParser.TYPE_END, TrieParser.TYPE_SAFE_END, TrieParser.TYPE_BRANCH_VALUE
,TrieParser.TYPE_VALUE_BYTES,TrieParser.TYPE_VALUE_NUMERIC,TrieParser.TYPE_ALT_BRANCH};
private int longestKnown = 0;
private int shortestKnown = Integer.MAX_VALUE;
public final byte caseRuleMask;
/**
 * @return the current value of the {@code longestKnown} field (0 until a pattern is set)
 */
public int longestKnown() {
    final int longest = this.longestKnown;
    return longest;
}
/**
 * @return the current value of the {@code shortestKnown} field
 *         ({@code Integer.MAX_VALUE} until a pattern is set)
 */
public int shortestKnown() {
    final int shortest = this.shortestKnown;
    return shortest;
}
private void setValue(int pos, byte[] source, int sourcePos, final int sourceLength, int sourceMask, long value) {
// System.out.print(value+" ");
// Appendables.appendUTF8(System.out, source, sourcePos, sourceLength, sourceMask);
// System.out.println();
//System.out.println("before set of value: "+this.toString());
assert(source.length >= sourceMask || sourceMask==Integer.MAX_VALUE) : "len "+source.length+" mask "+sourceMask;
assert(isValidSize(value));
assert(sourceLength<=source.length);
assert((sourceMask&sourcePos)<=source.length);
activeExtractionCount = 0;//clear this so it can be requested after set is complete.
longestKnown = Math.max(longestKnown, computeMax(source, sourcePos, sourceLength, sourceMask));
shortestKnown = Math.min(shortestKnown, sourceLength);
assert(value >= 0 || value>8)&0xFF;
int trieLen = meta&0xFF;
short v1 = (short) (0xFF&source[sourceMask & sourcePos]);
if (NO_ESCAPE_SUPPORT!=ESCAPE_BYTE && ESCAPE_BYTE==v1 && ESCAPE_BYTE!=source[sourceMask & (1+sourcePos)] ) {
final int sourceLength1 = sourceLength-length;
assert(sourceLength1>=1);
writeEnd(writeRuns(insertAltBranch(0, pos-2, source, sourcePos, sourceLength1, sourceMask), source, sourcePos, sourceLength1, sourceMask), value);
return;
}
v1 &= caseRuleMask;
if (v1 < offset) {
//logger.info("expanded switch on low end");
int growBy = offset-v1;
int requiredRoom = growBy<<1;
int neededLen = limit+requiredRoom;
if (neededLen > data.length) {
growDataLen(neededLen);
}
updatePreviousJumpDistances(0, data, topPos, requiredRoom);
System.arraycopy(data, topPos, data, topPos + requiredRoom, limit - topPos);
limit+=requiredRoom;
//update metadata
int newOffset = v1;
int newTrieLen = trieLen + growBy;
data[metaPos] = (short)((newOffset<<8)|newTrieLen);
Arrays.fill(data, metaPos+1, metaPos+1+requiredRoom, (byte)-1);
int indexJump = limit-(metaPos+(newTrieLen<<1));
data[pos] = (short)(0x7FFF&(indexJump>>15));
data[pos+1] = (short)(0x7FFF&(indexJump));
limit = writeEnd(writeRuns(limit, source, sourcePos, sourceLength-length, sourceMask), value);
return;
}
int dif = v1-offset;
if (dif >= trieLen) {
dif++;
//logger.info("expanded switch on high end");
int growBy = dif-trieLen;
int oldEnd = pos + (trieLen<<1);
int requiredRoom = growBy<<1;
int neededLen = limit+requiredRoom;
if (neededLen > data.length) {
growDataLen(neededLen);
}
updatePreviousJumpDistances(0, data, topPos, requiredRoom);
System.arraycopy(data, oldEnd, data, oldEnd + requiredRoom, limit - oldEnd);
limit+=requiredRoom;
//update metadata
int newOffset = offset;
int newTrieLen = dif;
data[metaPos] = (short)((newOffset<<8)|newTrieLen);
Arrays.fill(data, oldEnd, oldEnd + (growBy<<1) , (byte)-1);
int idx = pos + ((v1-offset)<<1);
int indexJump = limit-(metaPos+(newTrieLen<<1));
data[idx] = (short)(0x7FFF&(indexJump>>15));
data[idx+1] = (short)(0x7FFF&(indexJump));
limit = writeEnd(writeRuns(limit, source, sourcePos, sourceLength-length, sourceMask), value);
return;
}
//jump to new position, all are relative to the end of the jump table so no values need to be
//adjusted if the jump table grows with new inserts.
final int jumpPos = metaPos +1 +(dif<<1);
int indexJump = (((int)data[jumpPos])<<15) | (0x7FFF&data[jumpPos+1]);
if (indexJump == -1) {
//logger.info("new switch value in the middle");
indexJump = limit-(metaPos+(trieLen<<1));
data[jumpPos] = (short)(0x7FFF&(indexJump>>15));
data[jumpPos+1] = (short)(0x7FFF&(indexJump));
limit = writeEnd(writeRuns(limit, source, sourcePos, sourceLength-length, sourceMask), value);
return;
}
// jump to location..
pos = indexJump+metaPos+(trieLen<<1);
}
break;
case TYPE_BRANCH_VALUE:
short v = (short) source[sourceMask & sourcePos];
if (NO_ESCAPE_SUPPORT!=ESCAPE_BYTE && ESCAPE_BYTE==v && ESCAPE_BYTE!=source[sourceMask & (1+sourcePos)] ) {
//we have found an escape sequence so we must insert a branch here we cant branch on a value
final int sourceLength1 = sourceLength-length;
assert(sourceLength1>=1);
int newPos = insertAltBranch(0, pos-1, source, sourcePos, sourceLength1, sourceMask);
writeEnd(writeRuns(newPos, source, sourcePos, sourceLength1, sourceMask), value);
return;
} else {
final int topOfItem = pos-1; //type idx
int tempPos = pos; //mask position
int jumpMask = computeJumpMask((short) v, data[tempPos]);
int rightJump = ((((int)data[1+tempPos])<<15) | (0x7FFF&data[2+tempPos])) & 0xFFFFFF;
pos = 3+tempPos+(jumpMask&rightJump);
int leftPos = tempPos+ 3;
int rightPos = tempPos+ 3 + rightJump;
//////////////////////////////////////
//possible conversion of a single conditional to a switch if this is a simple switch
int leftByte = (0xFF & data[leftPos]);
assert(rightPos data.length) {
growDataLen(neededLen);
}
updatePreviousJumpDistances(0, data, topOfItem, requiredRoom);
System.arraycopy(data, topOfItem, data, topOfItem + requiredRoom, limit - topOfItem);
limit+=requiredRoom;
int newPos = limit;
final int sourceLength1 = sourceLength-length;
limit = writeEnd(writeRuns(newPos, source, sourcePos, sourceLength1, sourceMask), value);
leftPos += (requiredRoom);
rightPos += (requiredRoom);
assert(data[newPos]<8 && data[newPos]>=0) : "bad type:"+data[newPos];
assert(data[leftPos]<8 && data[leftPos]>=0) : "bad type:"+data[leftPos];
assert(data[rightPos]<8 && data[rightPos]>=0) : "bad type:"+data[rightPos];
newPos -= ((topOfItem+1)+(trieLen<<1));
leftPos -= ((topOfItem+1)+(trieLen<<1));
rightPos -= ((topOfItem+1)+(trieLen<<1));
int insertByteBranch = writeSwitch3(TrieParser.TYPE_SWITCH_BRANCH, topOfItem,
sourceByte, leftByte, rightByte,
(short)(newPos>>15), (short)(0x7FFF&newPos),
(short)(leftPos>>15), (short)(0x7FFF&leftPos),
(short)(rightPos>>15), (short)(0x7FFF&rightPos)
);
return;
}
}
break;
case TYPE_ALT_BRANCH:
short w = (short) source[sourceMask & sourcePos];
if (NO_ESCAPE_SUPPORT!=ESCAPE_BYTE && ESCAPE_BYTE==w && ESCAPE_BYTE!=source[sourceMask & (1+sourcePos)] ) {
//new data is an alt so insert on alt side
pos +=2;
} else {
//check the fixed jump side and push the var side for later
int jump = (((int)data[pos++])<<15) | (0x7FFF&data[pos++]);
if (data[pos]!=TYPE_VALUE_NUMERIC) {
//take the far one first (default)
pushAlt(pos, sourcePos);
pos = pos+jump;
} else {
//take this local one first because it is numeric
pushAlt(pos+jump, sourcePos);
}
}
break;
case TYPE_VALUE_NUMERIC:
if (ESCAPE_BYTE==source[sourceMask & sourcePos]) {
byte second = source[sourceMask & (sourcePos+1)];
extractions[activeExtractionCount++] = second;
maxExtractedFields = Math.max(maxExtractedFields, activeExtractionCount);
if (isNumber(second)) {
pos++;
length += 2;
sourcePos += 2;
break;
}
}
final int insertLengthNumericCapture = sourceLength-length;
assert(insertLengthNumericCapture>=1);
//this is not a number we are inserting so it goes on the end.
writeEnd(writeRuns(appendAltBranch(pos-1, source, sourcePos, insertLengthNumericCapture, sourceMask), source, sourcePos, insertLengthNumericCapture, sourceMask), value);
return;
case TYPE_VALUE_BYTES:
extractions[activeExtractionCount++] = ESCAPE_CMD_BYTES;
maxExtractedFields = Math.max(maxExtractedFields, activeExtractionCount);
if (ESCAPE_BYTE!=source[sourceMask & sourcePos] ||
ESCAPE_CMD_BYTES!=source[sourceMask & (sourcePos+1)] ||
data[pos]!=source[sourceMask & (sourcePos+2)] ) {
////
//this insert puts the new data on the very end instead of inserting it
//in order to ensure the explicit patterns are always found "to the right"
////
final int insertLengthBytesCapture = sourceLength-length;
assert(insertLengthBytesCapture>=1);
writeEnd(writeRuns(appendAltBranch(pos-1, source, sourcePos, insertLengthBytesCapture, sourceMask), source, sourcePos, insertLengthBytesCapture, sourceMask), value);
return;
} else {
pos++;//for the stop consumed
length += 3;//move length forward by count of extracted bytes
sourcePos += 3;
}
break;
case TYPE_RUN:
//run
int runPos = pos++;
final int run = data[runPos];
int r = run;
final int afterWhileRun = run-(sourceLength-length);
if (sourceLength < run+length) {
r = sourceLength-length;
assert(r= 0) {
byte sourceByte = source[sourceMask & sourcePos++];
//found an escape byte, so this set may need to break the run up.
if (ESCAPE_BYTE == sourceByte && NO_ESCAPE_SUPPORT!=ESCAPE_BYTE) {
sourceByte = source[sourceMask & sourcePos++];
//confirm second value is not also the escape byte so we do have a command
if (ESCAPE_BYTE != sourceByte) {
insertAtBranchValueAlt(pos, source, sourceLength, sourceMask, value, length, runPos, run, r+afterWhileRun, sourcePos-2); //TODO: this count can be off by buried extractions.
return;
} else {
sourcePos+=1;//found literal
}
//else we have two escapes in a row therefore this is a literal
}
if (data[pos++] != sourceByte) {
//add switch
insertAtBranchValueByte(pos, source, sourceLength, sourceMask, value, length, runPos, run, r+afterWhileRun, sourcePos-1);
maxExtractedFields = Math.max(maxExtractedFields, activeExtractionCount);
return;
}
}
length = afterWhileLength;
//matched up to this point but this was shorter than the run so insert a safe point
insertNewSafePoint(pos, source, sourcePos, afterWhileRun, sourceMask, value, runPos);
maxExtractedFields = Math.max(maxExtractedFields, activeExtractionCount);
return;
}
final byte caseMask = caseRuleMask;
while (--r >= 0) {
byte sourceByte = source[sourceMask & sourcePos++];
if (((caseMask&data[pos]) != (caseMask&0xFF&sourceByte)) && ESCAPE_BYTE == sourceByte && NO_ESCAPE_SUPPORT!=ESCAPE_BYTE) {
//source byte is %
sourceByte = source[sourceMask & sourcePos++];
if (ESCAPE_BYTE != sourceByte) {
//sourceByte holds the specific command
insertAtBranchValueAlt(pos, source, sourceLength, sourceMask, value, length, runPos, run, r, sourcePos-2);
maxExtractedFields = Math.max(maxExtractedFields, activeExtractionCount);
return;
} else {
//this was %%
sourcePos+=1; //found literal
}
//else we have two escapes in a row therefore this is a literal
}
if ((caseMask&data[pos++]) != (caseMask&0xFF&sourceByte)) {
insertAtBranchValueByte(pos, source, sourceLength, sourceMask, value, length, runPos, run, r, sourcePos-1);
maxExtractedFields = Math.max(maxExtractedFields, activeExtractionCount);
return;
}
}
length+=run;
break;
case TYPE_END:
if (sourceLength>length) {
convertEndToNewSafePoint(pos, source, sourcePos, sourceLength-length, sourceMask, value);
} else {
writeEndValue(pos, value);
}
maxExtractedFields = Math.max(maxExtractedFields, activeExtractionCount); //TODO: should this only be for the normal end??
return;
case TYPE_SAFE_END:
if (sourceLength>length) {
///jump over the safe end values and continue on
pos += SIZE_OF_RESULT;
break;
} else {
pos = writeEndValue(pos, value);
maxExtractedFields = Math.max(maxExtractedFields, activeExtractionCount);
return;
}
default:
logger.info("unknown op {}",this);
throw new UnsupportedOperationException("unknown op "+type+" at "+(pos-1));
}
//hold this parent for downstream items
parentType = topPos;
}
} else {
//Start case where we insert the first run;
pos = writeRuns( pos, source, sourcePos, sourceLength, sourceMask);
limit = Math.max(limit, writeEnd(pos, value));
}
}
/**
 * Checks that a result value fits in the storage reserved for results,
 * which is {@code SIZE_OF_RESULT} shorts of 16 bits each.
 *
 * The bit count is taken from the position of the highest set bit. The
 * earlier {@code ceil(log2(value))} computation reported one bit too few
 * for exact powers of two and depended on floating point rounding.
 *
 * @param value the result value about to be stored
 * @return true if the value fits, false (after logging a warning) if it needs more bits
 */
private boolean isValidSize(long value) {
    final int maxBits = 16*SIZE_OF_RESULT;
    if (value <= 0) {
        //zero and negative values have always been accepted by this check, keep that contract
        return true;
    }
    final int actualBits = Long.SIZE - Long.numberOfLeadingZeros(value);
    if (actualBits>maxBits) {
        logger.warn("This TrieParser was created to hold max values of {} bits but was passed {} which requires {}"
                ,maxBits,value,actualBits);
        return false;
    }
    return true;
}
/**
 * Reports whether the byte following an escape is one of the numeric
 * capture commands.
 *
 * @param second the command byte found after the escape byte
 * @return true for any of the integer, hex, decimal or rational commands
 */
private boolean isNumber(byte second) {
    switch (second) {
        case ESCAPE_CMD_UNSIGNED_INT:
        case ESCAPE_CMD_UNSIGNED_HEX:
        case ESCAPE_CMD_DECIMAL:
        case ESCAPE_CMD_RATIONAL:
        case ESCAPE_CMD_OPTIONAL_SIGNED_INT:
        case ESCAPE_CMD_SIGNED_INT:
        case ESCAPE_CMD_SIGNED_HEX:
            return true;
        default:
            return false;
    }
}
/**
 * Estimates the longest input a pattern can match.
 *
 * Literal bytes count as one, an escaped escape byte counts as one, a bytes
 * capture command counts as {@code maxBytesCapturable} and every numeric
 * capture command counts as {@code maxNumericLenCapturable}. Any other byte
 * following an escape counts as one.
 *
 * The escape state is cleared once the byte after the escape has been
 * consumed. Previously it stayed set after a command, so later literal bytes
 * were read as commands and the next real command was counted as a literal.
 *
 * @param source bytes holding the pattern
 * @param pos    index of the first pattern byte (wrapped with {@code mask})
 * @param len    number of pattern bytes to scan
 * @param mask   mask applied to every index into {@code source}
 * @return the computed maximum length
 */
private int computeMax(byte[] source, int pos, int len, int mask) {
    //int values can be long and we follow the same limits as the parser
    int total = 0;
    boolean escapeDetected = false;
    for (int i = 0; i < len; i++) {
        final byte value = source[mask & pos++];
        if (!escapeDetected) {
            if (ESCAPE_BYTE == value) {
                //start of an escape sequence, the next byte decides what it means
                escapeDetected = true;
            } else {
                total++;
            }
            continue;
        }
        //this byte completes the escape sequence whatever it turns out to be
        escapeDetected = false;
        if (ESCAPE_BYTE == value) {
            //two escapes in a row are a single literal
            total++;
        } else if (ESCAPE_CMD_BYTES == value) {
            total += maxBytesCapturable;
        } else if (isNumber(value)) {
            total += maxNumericLenCapturable;
        } else {
            total++;
        }
    }
    return total;
}
/**
 * Sets the length that the maximum-length computation adds for each
 * bytes capture command found in a pattern.
 *
 * @param value the new length to assume per bytes capture
 */
public void setMaxBytesCapturable(int value) {
    this.maxBytesCapturable = value;
}
/**
 * Sets the length that the maximum-length computation adds for each
 * numeric capture command found in a pattern.
 *
 * @param value the new length to assume per numeric capture
 */
public void setMaxNumericLengthCapturable(int value) {
    this.maxNumericLenCapturable = value;
}
/**
 * Walks a chain of alt branches starting at {@code pos} and pushes every
 * non alt-branch position it reaches onto the alt stack.
 *
 * @param pos    index in {@code data} of the node to inspect
 * @param offset source position recorded with each pushed entry
 */
void recurseAltBranch(int pos, int offset) {
    if (TrieParser.TYPE_ALT_BRANCH != data[pos]) {
        //not an alt branch, this is a position to try later
        pushAlt(pos, offset);
        return;
    }
    //the two jump shorts follow the type, the node after them is peeked at
    final int jumpIdx = pos + 1;
    assert(data[jumpIdx]>=0): "bad value "+data[jumpIdx];
    assert(data[jumpIdx+1]>=0): "bad value "+data[jumpIdx+1];
    final int jump = (((int)data[jumpIdx])<<15) | (0x7FFF&data[jumpIdx+1]);
    final int peekNextType = data[jumpIdx+2];
    altBranch(jumpIdx, offset, jump, peekNextType);
}
/**
 * Pushes both sides of an alt branch, choosing the push order from the type
 * of the node that directly follows the branch.
 *
 * When that local node is a capture or another alt branch it is pushed
 * first so it ends up lower in the stack than the jump target; otherwise the
 * jump target is pushed first.
 *
 * @param pos          index of the first jump short of the alt branch
 * @param offset       source position recorded with each pushed entry
 * @param jump         decoded jump distance, must be positive
 * @param peekNextType type of the node that follows the alt branch
 */
void altBranch(int pos, int offset, int jump, int peekNextType) {
    assert(jump>0) : "Jump must be postitive but found "+jump;

    final int localTarget = pos + TrieParser.BRANCH_JUMP_SIZE;
    final int jumpTarget = pos + jump + TrieParser.BRANCH_JUMP_SIZE;

    final boolean localGoesFirst = TrieParser.TYPE_VALUE_BYTES == peekNextType
                                || TrieParser.TYPE_VALUE_NUMERIC == peekNextType
                                || TrieParser.TYPE_ALT_BRANCH == peekNextType;

    if (localGoesFirst) {
        //local side holds an extraction so it goes to the bottom of the stack
        recurseAltBranch(localTarget, offset);
        recurseAltBranch(jumpTarget, offset);
    } else {
        recurseAltBranch(jumpTarget, offset);
        recurseAltBranch(localTarget, offset);
    }
}
/**
 * Pushes one entry onto the alt stack: the trie position goes to
 * {@code altStackA} and the source position to {@code altStackB}.
 *
 * @param pos       trie position to remember
 * @param sourcePos source position to remember with it
 */
private void pushAlt(int pos, int sourcePos) {
    final int top = altStackPos;
    altStackA[top] = pos;
    altStackPos = top + 1;
    altStackB[top] = sourcePos;
}
/**
 * Translates a numeric capture command byte into the flag bits stored after
 * a numeric value node.
 *
 * @param sourceByte the command byte that followed the escape byte
 * @return the combination of NUMERIC_FLAG_* bits for that command
 * @throws UnsupportedOperationException if the byte is not a numeric command
 */
static short buildNumberBits(byte sourceByte) {
    final short bits;
    switch (sourceByte) {
        case ESCAPE_CMD_UNSIGNED_INT:
            bits = 0;
            break;
        case ESCAPE_CMD_SIGNED_INT:
            bits = (short)NUMERIC_FLAG_SIGN;
            break;
        case ESCAPE_CMD_OPTIONAL_SIGNED_INT:
            bits = (short)(NUMERIC_FLAG_SIGN | NUMERIC_FLAG_ABSENT_IS_ZERO);
            break;
        case ESCAPE_CMD_UNSIGNED_HEX:
            bits = (short)NUMERIC_FLAG_HEX;
            break;
        case ESCAPE_CMD_SIGNED_HEX:
            bits = (short)(NUMERIC_FLAG_HEX | NUMERIC_FLAG_SIGN);
            break;
        case ESCAPE_CMD_DECIMAL:
            bits = (short)NUMERIC_FLAG_DECIMAL;
            break;
        case ESCAPE_CMD_RATIONAL:
            bits = (short)(NUMERIC_FLAG_SIGN | NUMERIC_FLAG_RATIONAL);
            break;
        default:
            throw new UnsupportedOperationException("Unsupported % operator found '"+((char)sourceByte)+"'");
    }
    return bits;
}
/**
 * Turns an existing end node into a safe end and inserts a new run plus end
 * directly after it for the remaining source bytes.
 *
 * @param pos          index just past the TYPE_END marker, where its result data starts
 * @param source       bytes holding the pattern
 * @param sourcePos    index of the first remaining pattern byte
 * @param sourceLength number of remaining pattern bytes
 * @param sourceMask   mask applied to every index into {@code source}
 * @param value        result value for the new, longer pattern
 * @throws UnsupportedOperationException if the node before {@code pos} is not TYPE_END
 */
private void convertEndToNewSafePoint(int pos, byte[] source, int sourcePos, int sourceLength, int sourceMask, long value) {
    final int endTypeIdx = pos - 1;
    if (TYPE_END != data[endTypeIdx]) {
        throw new UnsupportedOperationException();
    }
    //the old end keeps its result but no longer terminates the walk
    data[endTypeIdx] = TYPE_SAFE_END;

    //TODO: bad req room, add to end instead we do not know right length...
    final int escapeAdjustment = midRunEscapeValuesSizeAdjustment(source, sourcePos, sourceLength, sourceMask);
    final int requiredRoom = SIZE_OF_END_1 + sourceLength + escapeAdjustment;
    makeRoomForInsert(0, endTypeIdx, requiredRoom);

    //the new run and end are written right behind the safe end
    final int runStart = endTypeIdx + SIZE_OF_SAFE_END;
    final int endStart = writeRuns(runStart, source, sourcePos, sourceLength, sourceMask);
    writeEnd(endStart, value);
}
/**
* Compute the additional space needed for any value extraction meta command found in the middle of a run.
*/
@Deprecated //TODO: Must remove all usages of this and instead add these to the end so we need not know the length.
private int midRunEscapeValuesSizeAdjustment(byte[] source, int sourcePos, int sourceLength, int sourceMask) {
if (0==sourceLength) {
return 0;
}
// new Exception(Appendables.appendUTF8(new StringBuilder("hello "), source, sourcePos, sourceLength, sourceMask).toString()).printStackTrace();
int adjustment = 0;
boolean needsRunStart = true;
for(int i=0;i=source.length) {
break;//stop here
}
byte value = source[idx];
// System.err.println(i+" "+((char)value));
if (ESCAPE_BYTE == value && NO_ESCAPE_SUPPORT!=ESCAPE_BYTE) {
i++;
value = source[sourceMask & (sourcePos+i)];
if (ESCAPE_BYTE != value) {
if (ESCAPE_CMD_BYTES == value) { //%bX
if ((sourceLength>2) && (i=1);
writeEnd(writeRuns(insertAltBranch(0, pos>=3 ? pos-3 : 0, source, sourceCharPos, insertLength, sourceMask), source, sourceCharPos, insertLength, sourceMask), value);
} else {
final int insertLength = sourceLength - (length+(data[runPos] = (short)(run-r1)));
assert(insertLength>=1);
writeEnd(writeRuns(insertAltBranch(r1, pos, source, sourceCharPos, insertLength, sourceMask), source, sourceCharPos, insertLength, sourceMask), value);
}
}
/**
 * Inserts a byte value branch at the point where the new pattern stops
 * matching an existing run, then writes the rest of the new pattern and
 * its end value.
 *
 * When the mismatch is on the first byte of the run the branch goes in
 * front of the whole run; otherwise the existing run length is rewritten to
 * {@code run-(r1+1)} and the branch is placed at {@code pos-1}.
 *
 * @param pos           index just past the mismatching byte in {@code data}
 * @param source        bytes holding the pattern
 * @param sourceLength  total length of the pattern
 * @param sourceMask    mask applied to every index into {@code source}
 * @param value         result value for the new pattern
 * @param length        number of pattern bytes already consumed before this run
 * @param runPos        index in {@code data} of the run length
 * @param run           original length of the run
 * @param r1            count of run bytes that remained when the mismatch was found
 * @param sourceCharPos index in {@code source} of the mismatching byte
 */
private void insertAtBranchValueByte(final int pos, byte[] source, int sourceLength, int sourceMask,
        long value, int length, int runPos, int run, int r1, final int sourceCharPos) {
    final int splitCount = r1 + 1;
    final int danglingByteCount;
    final int branchPos;
    final int remainingLength;

    if (splitCount == run) {
        //mismatch on the very first byte, branch ahead of the run header
        remainingLength = sourceLength - length;
        danglingByteCount = 0;
        branchPos = (pos >= 3) ? pos - 3 : 0;
    } else {
        final short shortenedRun = (short)(run - splitCount);
        data[runPos] = shortenedRun;
        remainingLength = sourceLength - (length + shortenedRun);
        danglingByteCount = splitCount;
        branchPos = pos - 1;
    }
    assert(remainingLength>=1);

    final int afterBranch = insertBranch(danglingByteCount, branchPos, source, sourceCharPos, remainingLength, sourceMask);
    final int afterRuns = writeRuns(afterBranch, source, sourceCharPos, remainingLength, sourceMask);
    writeEnd(afterRuns, value);
}
/**
 * Opens a gap at {@code pos} and writes a byte value branch there whose
 * mask separates the first new source byte from the existing value.
 *
 * @param danglingByteCount count of bytes of the existing run left behind the insert
 * @param pos               index in {@code data} where the branch is written
 * @param source            bytes holding the pattern
 * @param sourcePos         index of the first new pattern byte
 * @param sourceLength      number of new pattern bytes
 * @param sourceMask        mask applied to every index into {@code source}
 * @return the index just past the written branch
 */
private int insertBranch(int danglingByteCount, int pos, byte[] source, final int sourcePos, final int sourceLength, int sourceMask) {
    //TODO: bad req room, add to end instead we do not know right length...
    final int escapeAdjustment = midRunEscapeValuesSizeAdjustment(source, sourcePos, sourceLength, sourceMask);
    final int requiredRoom = SIZE_OF_END_1 + SIZE_OF_BRANCH + sourceLength + escapeAdjustment;

    final int oldValueIdx = makeRoomForInsert(danglingByteCount, pos, requiredRoom);

    final short newByte = (short) source[sourcePos & sourceMask];
    final short criteria = findSingleBitMask(newByte, this.data[oldValueIdx]);
    return writeBranch(TYPE_BRANCH_VALUE, pos, requiredRoom, criteria);
}
/**
 * Opens a gap at {@code pos} and writes an alt branch header there. The
 * caller writes the new run and end into the gap behind the header; the
 * jump stored in the header skips that gap so the alternate case lands on
 * the data that was moved down.
 *
 * @param danglingByteCount count of bytes of the existing run left behind the insert
 * @param pos               index in {@code data} where the alt branch is written
 * @param source            bytes holding the pattern
 * @param sourcePos         index of the first new pattern byte
 * @param sourceLength      number of new pattern bytes
 * @param sourceMask        mask applied to every index into {@code source}
 * @return the index just past the alt branch header
 */
private int insertAltBranch(int danglingByteCount, int pos, byte[] source, final int sourcePos, final int sourceLength, int sourceMask) {
    //TODO: bad req room, add to end instead we do not know right length...
    final int escapeAdjustment = midRunEscapeValuesSizeAdjustment(source, sourcePos, sourceLength, sourceMask);
    //distance to skip once the branch header itself has been read
    final int jump = SIZE_OF_END_1 + sourceLength + escapeAdjustment;

    makeRoomForInsert(danglingByteCount, pos, jump + SIZE_OF_ALT_BRANCH);

    //TODO: type alt branch needs 2 jumps so we can add new run to the end
    data[pos] = TYPE_ALT_BRANCH;
    data[pos + 1] = (short)(0x7FFF&(jump>>15));
    data[pos + 2] = (short)(0x7FFF&jump);
    return pos + 3;
}
/**
 * Inserts an alt branch header at {@code pos} that keeps the existing data
 * in place behind it and jumps to the end of the trie, where room is
 * reserved for the new run and end.
 *
 * @param pos          index in {@code data} where the alt branch header is inserted
 * @param source       bytes holding the pattern
 * @param sourcePos    index of the first new pattern byte
 * @param sourceLength number of new pattern bytes
 * @param sourceMask   mask applied to every index into {@code source}
 * @return the index at the end of the moved data where the caller should write the new run
 */
private int appendAltBranch(int pos, byte[] source, final int sourcePos, final int sourceLength, int sourceMask) {
//only the branch header is inserted here, the new run goes on the end
final int requiredRoom = SIZE_OF_ALT_BRANCH;
//everything from pos up to the current limit slides down behind the header
final int toBeMoved = limit - pos;
limit+=requiredRoom;
//space reserved after the moved data for the new run and its end
int roomToGrow = SIZE_OF_END_1 + sourceLength + midRunEscapeValuesSizeAdjustment(source, sourcePos, sourceLength, sourceMask);
if (limit+toBeMoved+roomToGrow > data.length) {
growDataLen(limit+toBeMoved+roomToGrow);
}
if (toBeMoved <= 0) {
//nothing to be moved
} else {
//jumps that cross the insert point must grow by the header size before the copy
updatePreviousJumpDistances(0, data, pos, requiredRoom);
assert(pos>=0);
System.arraycopy(data, pos, data, pos + requiredRoom, toBeMoved);
}
data[pos++] = TYPE_ALT_BRANCH;
//this will jump to the end, we are assuming that some later call will add something to that point
//the jump distance is the length of the moved data, split into two 15 bit parts
data[pos++] = (short)(0x7FFF&(toBeMoved>>15));
data[pos++] = (short)(0x7FFF&toBeMoved);
//limit already includes the header, so this is where the moved data ends
int target = limit;
limit +=roomToGrow;
return target;
}
/**
 * Picks a single bit in the low byte on which {@code a} and {@code b}
 * differ and packs it into a branch criteria value.
 *
 * Bit 5 (0x20, the bit that separates upper and lower case ASCII letters)
 * is only chosen when no other bit of the low byte differs, which is
 * critical for case insensitivity. The lowest differing bit wins otherwise.
 *
 * @param a first value to tell apart
 * @param b second value to tell apart
 * @return the chosen bit in the low byte; the high byte is 0xFF when
 *         {@code b} has that bit clear and zero when {@code b} has it set
 */
private short findSingleBitMask(short a, short b) {
    final int caseBit = 1 << 5;
    //bits of the low byte that differ, leaving the case bit for last
    final int differing = (a ^ b) & 0xFF & ~caseBit;
    final int mask = (0 == differing) ? caseBit : Integer.lowestOneBit(differing);

    final int highByte = 0xFF00 & ((mask & b) - 1);
    final short result = (short)(highByte | mask);
    assert((result&a) != (result&b));
    return result;
}
/**
 * Shifts everything from {@code pos} up to {@code limit} down by
 * {@code requiredRoom} slots, growing the array first if needed and
 * adjusting earlier jumps that cross {@code pos}. The {@code limit} field
 * itself is not changed here.
 *
 * @param pos          index in {@code data} where the gap opens
 * @param requiredRoom number of slots to open
 */
private void makeRoom(int pos, int requiredRoom) {
    final int grownLimit = limit + requiredRoom;
    if (data.length < grownLimit) {
        growDataLen(grownLimit);
    }
    updatePreviousJumpDistances(0, data, pos, requiredRoom);
    final int tailLength = limit - pos;
    System.arraycopy(data, pos, data, pos + requiredRoom, tailLength);
}
/**
 * Opens a gap of {@code requiredRoom} slots at {@code pos}, moving the data
 * from {@code pos} to {@code limit} down and advancing {@code limit}.
 *
 * When {@code danglingByteCount} is positive the gap is widened by
 * SIZE_OF_RUN and, if any data was moved, a TYPE_RUN header with that count
 * is written directly in front of the moved data.
 *
 * @param danglingByteCount count for the run header placed in front of the moved data, or 0 for none
 * @param pos               index in {@code data} where the gap opens
 * @param requiredRoom      number of slots to open, not counting the optional run header
 * @return {@code pos} when nothing had to be moved; the new index of the moved
 *         data when a run header was prepended; otherwise that index plus SIZE_OF_RUN
 */
@Deprecated
private int makeRoomForInsert(int danglingByteCount, int pos, int requiredRoom) {
//count of slots between the insert point and the end of the used data
final int toBeMoved = limit - pos;
if (danglingByteCount > 0) {
requiredRoom+=SIZE_OF_RUN; //added because we will prepend this with a TYPE_RUN header to close the dangling bytes
}
//capacity check keeps one extra run header of slack beyond the new limit
int neededLen = limit+requiredRoom+SIZE_OF_RUN;
if (neededLen > data.length) {
growDataLen(neededLen);
}
limit+=requiredRoom;
int newPos;
if (toBeMoved <= 0) {
newPos = pos;//nothing to be moved
} else {
//jumps that cross the insert point must grow by the gap size before the copy
updatePreviousJumpDistances(0, data, pos, requiredRoom);
newPos = pos + requiredRoom;
assert(pos>=0);
System.arraycopy(data, pos, data, newPos, toBeMoved);
if (danglingByteCount > 0) {//do the prepend because we now have room
//NOTE(review): the fixed offsets -2 and -1 presume SIZE_OF_RUN is 2 - confirm
data[newPos-2] = TYPE_RUN;
data[newPos-1] = (short)danglingByteCount;
} else {
//new position already has the start of run so move cursor up to the first data point
newPos+=SIZE_OF_RUN;
}
}
return newPos;
}
/**
 * Replaces {@code data} with a larger array holding the same content. The
 * new length is the larger of twice the current length and {@code neededLen}.
 *
 * @param neededLen the minimum length required by the caller
 * @throws UnsupportedOperationException if this parser was created with a fixed size
 */
private void growDataLen(int neededLen) {
    if (this.fixedSize) {
        throw new UnsupportedOperationException("allocated length of "+data.length+" is too short to add all the patterns");
    }
    final int doubledLen = data.length*2;
    final short[] grown = new short[Math.max(doubledLen, neededLen)];
    System.arraycopy(data, 0, grown, 0, data.length);
    data = grown;
}
private void updatePreviousJumpDistances(int i, short[] data, int localLimit, int requiredRoom) {
while (i>8)&0xFF;
int trieLen = meta&0xFF;
for(int k = 0; k=0) {
if (jmp+metaPos+(trieLen<<1) > localLimit) {
jmp += requiredRoom;
data[i] = (short)(0x7FFF&(jmp>>15));
data[i+1] = (short)(0x7FFF&(jmp));
}
}
i+=2;
}
break;
case TYPE_SAFE_END:
i += SIZE_OF_SAFE_END;
break;
case TYPE_BRANCH_VALUE:
{
int jmp = (((int)data[i+2]) << 15)|(0x7FFF&data[i+3]);
int newPos = SIZE_OF_BRANCH+i+jmp;
if (newPos > localLimit) {
//System.err.println("byte jmp "+ jmp+" adjusted to new jump of "+(jmp+requiredRoom));
//System.err.println("byte jmp target "+ (i+4+jmp)+" adjusted to new jump targe of "+(i+4+jmp+requiredRoom));
//adjust this value because it jumps over the new inserted block
jmp += requiredRoom;
data[i+2] = (short)(0x7FFF&(jmp>>15));
data[i+3] = (short)(0x7FFF&(jmp));
}
i += SIZE_OF_BRANCH;
}
break;
case TYPE_ALT_BRANCH:
{
int jmp = (((int)data[i+1]) << 15)|(0x7FFF&data[i+2]);
int newPos = SIZE_OF_ALT_BRANCH+i+jmp;
if (newPos > localLimit) {
//System.err.println("alt jmp "+jmp+" adjusted to new jump of "+(jmp+requiredRoom));
//adjust this value because it jumps over the new inserted block
jmp += requiredRoom;
data[i+1] = (short)(0x7FFF&(jmp>>15));
data[i+2] = (short)(0x7FFF&(jmp));
}
i += SIZE_OF_ALT_BRANCH;
}
break;
case TYPE_VALUE_NUMERIC:
i += SIZE_OF_VALUE_NUMERIC;
break;
case TYPE_VALUE_BYTES:
i += SIZE_OF_VALUE_BYTES;
break;
case TYPE_RUN:
assert(data[i+1] >= 0) : "run length must be positive but we found "+data[i+1]+" at position "+i;
i = i+SIZE_OF_RUN+data[i+1];
break;
case TYPE_END:
i += SIZE_OF_END_1;
break;
default:
logger.info("unknown op {}",this);
throw new UnsupportedOperationException("ERROR Unrecognized value "+data[i]+" at "+i);
}
}
}
/**
 * Writes a switch node with three populated entries.
 *
 * The node is the type, one short packing the smallest key (high byte) and
 * the table length (low byte), then one pair of shorts per table slot.
 * Every slot is first filled with -1 and the three given jumps are then
 * stored at the slots for {@code b1}, {@code b2} and {@code b3}.
 *
 * @param type       node type to write
 * @param pos        index in {@code data} where the node starts
 * @param b1         key of the first entry
 * @param b2         key of the second entry
 * @param b3         key of the third entry
 * @param b1jumpHigh high short of the jump for {@code b1}
 * @param b1jumpLow  low short of the jump for {@code b1}
 * @param b2jumpHigh high short of the jump for {@code b2}
 * @param b2jumpLow  low short of the jump for {@code b2}
 * @param b3jumpHigh high short of the jump for {@code b3}
 * @param b3jumpLow  low short of the jump for {@code b3}
 * @return the index just past the written node
 */
private int writeSwitch3(byte type, int pos,
        int b1, int b2, int b3,
        short b1jumpHigh, short b1jumpLow,
        short b2jumpHigh, short b2jumpLow,
        short b3jumpHigh, short b3jumpLow
        ) {
    final int lowestKey = Math.min(b1, Math.min(b2, b3));
    final int highestKey = Math.max(b1, Math.max(b2, b3));
    final int tableLen = 1 + highestKey - lowestKey;

    final int nodeEnd = pos + (2 + (2*tableLen));
    if (data.length < nodeEnd) {
        growDataLen(nodeEnd);
    }

    data[pos++] = type;
    data[pos++] = (short)((lowestKey<<8)|tableLen);

    //mark every slot as empty before filling in the known ones
    final int tableStart = pos;
    for (int slot = 0; slot < tableLen; slot++) {
        data[pos++] = -1;
        data[pos++] = -1;
    }

    final int slot1 = tableStart + ((b1-lowestKey)*2);
    data[slot1] = b1jumpHigh;
    data[slot1 + 1] = b1jumpLow;

    final int slot2 = tableStart + ((b2-lowestKey)*2);
    data[slot2] = b2jumpHigh;
    data[slot2 + 1] = b2jumpLow;

    final int slot3 = tableStart + ((b3-lowestKey)*2);
    data[slot3] = b3jumpHigh;
    data[slot3 + 1] = b3jumpLow;

    return pos;
}
/**
 * Writes a four slot branch node: type, criteria, then the jump split into
 * a high and a low 15 bit part. The stored jump is {@code requiredRoom}
 * less the size of the branch node itself.
 *
 * @param type         node type to write
 * @param pos          index in {@code data} where the node starts
 * @param requiredRoom room opened for the insert, including this branch node
 * @param criteria     mask value stored after the type
 * @return the index just past the written node
 */
private int writeBranch(byte type, int pos, int requiredRoom, short criteria) {
    final int nodeEnd = pos + 4;
    if (data.length < nodeEnd) {
        growDataLen(nodeEnd);
    }
    final int jump = requiredRoom - SIZE_OF_BRANCH;
    data[pos] = type;
    data[pos + 1] = criteria;
    data[pos + 2] = (short)(0x7FFF&(jump>>15));
    data[pos + 3] = (short)(0x7FFF&jump);
    return nodeEnd;
}
/**
 * Writes an end node (TYPE_END followed by the result value) at
 * {@code pos}, growing the array first if needed.
 *
 * @param pos   index in {@code data} where the end node starts
 * @param value result value to store
 * @return the index just past the stored value
 */
private int writeEnd(int pos, long value) {
    final int nodeEnd = pos + 1 + SIZE_OF_RESULT;
    if (data.length < nodeEnd) {
        growDataLen(nodeEnd);
    }
    data[pos] = TYPE_END;
    return writeEndValue(pos + 1, value);
}
/**
 * Stores {@code value} as {@code SIZE_OF_RESULT} shorts starting at
 * {@code pos}, most significant 16 bits first.
 *
 * @param pos   index in {@code data} of the first result short
 * @param value result value to store
 * @return the index just past the stored value
 */
private int writeEndValue(int pos, long value) {
    for (int shift = (SIZE_OF_RESULT - 1) << 4; shift >= 0; shift -= 16) {
        data[pos++] = (short)(0xFFFF & (value >> shift));
    }
    return pos;
}
/**
 * Reads a result value stored as 1 to 4 shorts, most significant first.
 * The first short keeps its sign, the remaining shorts are treated as
 * unsigned 16 bit parts.
 *
 * @param data       array holding the value
 * @param pos        index of the first short of the value
 * @param resultSize number of shorts used per value; anything below 2 reads
 *                   one short and anything above 3 reads four
 * @return the decoded value
 */
static long readEndValue(short[] data, int pos, int resultSize) {
    switch (resultSize) {
        case 2: {
            //2 -- most common choice
            final int high = ((int)data[pos]) << 16;
            final long low = 0xFFFFL & data[pos + 1];
            return high | low;
        }
        case 3: {
            final long high = ((long)data[pos]) << 32;
            final long mid = (0xFFFFL & data[pos + 1]) << 16;
            final long low = 0xFFFFL & data[pos + 2];
            return high | mid | low;
        }
        default: {
            if (resultSize < 2) {
                //1
                return data[pos];
            }
            //4
            final long top = ((long)data[pos]) << 48;
            final long upper = (0xFFFFL & data[pos + 1]) << 32;
            final long lower = (0xFFFFL & data[pos + 2]) << 16;
            final long bottom = 0xFFFFL & data[pos + 3];
            return top | upper | lower | bottom;
        }
    }
}
/**
 * Writes a bytes capture node (TYPE_VALUE_BYTES followed by the stop
 * value) and records the capture in the list of active extractions.
 *
 * @param pos  index in {@code data} where the node starts
 * @param stop value stored after the type, the byte at which the capture stops
 * @return the index just past the written node
 */
private int writeBytesExtract(int pos, short stop) {
    final int nodeEnd = pos + 2;
    if (data.length < nodeEnd) {
        growDataLen(nodeEnd);
    }
    data[pos] = TYPE_VALUE_BYTES;
    data[pos + 1] = stop;

    extractions[activeExtractionCount++] = ESCAPE_CMD_BYTES;
    if (activeExtractionCount > maxExtractedFields) {
        maxExtractedFields = activeExtractionCount;
    }
    return nodeEnd;
}
/**
 * Writes a numeric capture node (TYPE_VALUE_NUMERIC followed by its flag
 * bits) and records the capture in the list of active extractions.
 *
 * @param pos  index in {@code data} where the node starts
 * @param type numeric command byte, narrowed to a byte before use
 * @return the index just past the written node
 * @throws UnsupportedOperationException if {@code type} is not a numeric command
 */
private int writeNumericExtract(int pos, int type) {
    final int nodeEnd = pos + 2;
    if (data.length < nodeEnd) {
        growDataLen(nodeEnd);
    }
    final byte command = (byte)type;
    data[pos] = TYPE_VALUE_NUMERIC;
    data[pos + 1] = buildNumberBits(command);

    extractions[activeExtractionCount++] = command;
    if (activeExtractionCount > maxExtractedFields) {
        maxExtractedFields = activeExtractionCount;
    }
    return nodeEnd;
}
private int writeRuns(int pos, byte[] source, int sourcePos, int sourceLength, int sourceMask) {
if (sourceLength<=0) {
return pos;
}
//check for room first.
int neededLen = pos+sourceLength+SIZE_OF_RUN;
if (neededLen>data.length) {
growDataLen(neededLen);
}
assert(ESCAPE_BYTE != source[sourceMask & (sourcePos+sourceLength-1)]) : "Escape byte is always followed by something and can not be last.";
pos = writeRunHeader(pos, sourceLength);
int runLenPos = pos-1;
int runLeft = sourceLength;
int sourceStop = sourceLength+sourcePos;
short activeRunLength = 0;
while (--runLeft >= 0) {
short value = (short)(0xFF&source[sourceMask & sourcePos++]);
if (ESCAPE_BYTE == value && NO_ESCAPE_SUPPORT!=ESCAPE_BYTE) {
value = source[sourceMask & sourcePos++];
if (ESCAPE_BYTE != value) {
//new command so we must stop the run at this point
if (activeRunLength > 0) {
data[runLenPos]=activeRunLength; //this run has ended so we must set the new length.
} else {
//wipe out run because we must start with extraction
pos = runLenPos-1;
}
if (ESCAPE_CMD_BYTES == value) {
byte stop = sourcePos 0) {
pos = writeRuns(pos, source, sourcePos, remainingLength, sourceMask);
}
} else {
pos = writeNumericExtract(pos, value);
int remainingLength = runLeft-1;
if (remainingLength > 0) {
pos = writeRuns(pos, source, sourcePos, remainingLength, sourceMask);
}
}
maxExtractedFields = Math.max(maxExtractedFields, activeExtractionCount);
return pos;
} else {
//do NOT add this value a second time here.
activeRunLength++;
//literal so jump over the second instance
sourcePos++;
}
}
data[pos++] = value;
activeRunLength++;
}
return pos;
}
/**
 * Writes a run header (TYPE_RUN followed by the run length) at {@code pos}.
 *
 * @param pos          index in {@code data} where the header starts
 * @param sourceLength run length to store, must be between 1 and 0x7FFF
 * @return the index just past the header
 * @throws UnsupportedOperationException if the length is outside the supported range
 */
private int writeRunHeader(int pos, int sourceLength) {
    final int headerEnd = pos + 2;
    if (data.length < headerEnd) {
        growDataLen(headerEnd);
    }
    final boolean lengthSupported = sourceLength >= 1 && sourceLength <= 0x7FFF;
    if (!lengthSupported) {
        throw new UnsupportedOperationException("does not support strings beyond this length "+0x7FFF+" value was "+sourceLength);
    }
    data[pos] = TYPE_RUN;
    data[pos + 1] = (short)sourceLength;
    return headerEnd;
}
/**
 * Adds a pattern held in a plain byte array, using the whole array.
 *
 * @param bytes the pattern bytes
 * @param value the result value to associate with the pattern
 */
public void setValue(byte[] bytes, long value) {
    final int start = 0;
    final int length = bytes.length;
    //presumably the fourth argument is the index mask; Integer.MAX_VALUE would leave indexes unchanged - confirm
    setValue(bytes, start, length, Integer.MAX_VALUE, value);
}
/**
 * Writes this trie in DOT format to the given file and prints the
 * {@code dot} command line that renders it to SVG.
 *
 * The stream is opened with try-with-resources so it is closed even when
 * writing fails part way through. A file that can not be opened is
 * reported through the logger and the method returns normally.
 *
 * @param targetFile file to create or overwrite with the DOT output
 */
public void toDOTFile(File targetFile) {
    final String filename = targetFile.getAbsolutePath();
    System.out.println("dot -Tsvg -o"+filename+".svg "+filename);
    try (PrintStream printStream = new PrintStream(targetFile)) {
        toDOT(printStream);
    } catch (FileNotFoundException e) {
        logger.error("Unable to open {} for DOT output", filename, e);
    }
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy