All Downloads are FREE. The search and download functionality uses the official Maven repository.

com.ociweb.pronghorn.util.CharSequenceHashBuilder Maven / Gradle / Ivy

The newest version!
package com.ociweb.pronghorn.util;

import java.util.Arrays;
import java.util.Comparator;

public class CharSequenceHashBuilder {
    
    //TODO: add a reset method to use this again
    // Comparator that orders CharSequences by length; not referenced elsewhere in the visible code.
    private final LengthComparitor lengthComparitor = new LengthComparitor();
    // Scratch counters handed to the scans as "countSpace": one slot per bit position plus a
    // final slot in which the counting methods store the size of the group just examined.
    private final int[] lengthCountSpace = new int[32+1]; //lengths are 32bit integers
    private static final int   MAX_VALUE_IDX = 32;//the index where we find the max value
    // Per-bit maxima filled in by accumulateLargestGroups during a scan (passed as "largeGroupCount").
    private final int[] largestGroup = new int[32];
    // Stack of chosen length-bit shifts together with its current depth.
    private byte[] shiftsStack = new byte[32];
    private int    shiftStackDepth = 0;
    // Stack of chosen character seeds together with its current depth.
    private int[]  charHashSeed = new int[32];
    private int    charHashSeedDepth = 0;
    
    
    // Creates a builder; all working state is initialized by the field declarations above.
    public CharSequenceHashBuilder() {        
    }
    

    
    private boolean recursiveBalanceScan(byte[] shifts, int depth, int primaryMask, int[] values, int base, int[] largeGroupCount, int[] countSpace) {
                
        int idx = depth - 1;     
        //System.out.println("idx "+idx+" base "+Integer.toBinaryString(base));
        if (idx < 0) {
            //use this mask and test this case            
            boolean needsMoreSplitting = countSplitBalancePerBit(values, primaryMask, base, countSpace);
            if (needsMoreSplitting) {
                accumulateLargestGroups(countSpace, largeGroupCount);
            }
            return needsMoreSplitting; //go no deeper
            
        }
        
        return recursiveBalanceScan(shifts, idx, primaryMask, values, base, largeGroupCount, countSpace) | //with zero for the mask
               recursiveBalanceScan(shifts, idx, primaryMask, values, base | (1<= 0) {
            if (filterCheck == (filterMask&values[i])) {
                maxTotal++; //count total members of this group
                sumEachBit(filterMask, countSpace, values[i], countSpace.length-1);
            }            
       }
       countSpace[countSpace.length-1] = maxTotal;
        
       return maxTotal > 1;
    }

    private void sumEachBit(int filterMask, int[] countSpace, int length, int j) {
        while (--j >= 0) {
            //only consider bits that have not already been chosen
            if (0==(filterMask&(1<> j)); //counting the 1 bits
            }
        }
    }
    
    /**
     * Folds the counters of one scan into the running per-bit maxima.
     * The last slot of {@code localCountSpace} is read as the size of the whole group; for
     * every other slot the larger of "count" and "group size minus count" is kept in
     * {@code target} whenever it exceeds the value already stored there.
     *
     * @param localCountSpace per-bit counts with the group total in the last slot
     * @param target          per-bit maxima, updated in place
     */
    private void accumulateLargestGroups(int[] localCountSpace, int[] target) {
        final int totalSlot = localCountSpace.length - 1;
        final int groupTotal = localCountSpace[totalSlot];

        for (int bit = totalSlot - 1; bit >= 0; bit--) {
            final int ones = localCountSpace[bit];
            final int zeros = groupTotal - ones;
            final int largerHalf = (zeros > ones) ? zeros : ones;
            if (largerHalf > target[bit]) {
                target[bit] = largerHalf;
            }
        }
    }
    
   
    
    // NOTE(review): this region is damaged in this copy of the file. On the lines containing
    // "(1<=0)", "maxValueLen<=0)" and "1<=0)" everything between a '<' and the next '>' appears
    // to have been lost, which fused findIdxOfSmallestGroup with the start of at least one
    // further method. The code below is left untouched; restore it from the upstream source
    // before relying on it.
    //
    // What is still visible: findIdxOfSmallestGroup walks largestGroupSizes from the last index
    // down and only considers entries whose size lies between 1 and maxValue; the closing
    // method returns a GeneratedHashRules built from the shift stack and char seed stack.
    private int findIdxOfSmallestGroup(int[] largestGroupSizes, int maxValue, int alreadyPicked) {

        int result = -1;
        int k = largestGroupSizes.length;
        while (--k >= 0) {            
                int groupSize = largestGroupSizes[k];                
                if (groupSize >= 1 && 
                    groupSize <= maxValue &&
                    (0==((1<=0) {
                
                accumMask |= (1<=0) {
            // tracks the longest length seen among values
            if (values[m].length()>maxValueLen) {
                maxValueLen = values[m].length();
            }
        }
        maxValueLen = maxValueLen<=0) {
                // per-bit maxima are rebuilt from scratch for every scan
                Arrays.fill(largestGroup, 0);
                
                needsMoreSplitting = recursiveBalanceScan(values, shiftsStack, shiftStackDepth, 0, shiftsStackAsBits, 
                                                          charHashSeed, -1, -1, 0, largestGroup, lengthCountSpace, mixBits, sigBits);
                
                //extracting the length bits
                bitShift = (byte)findIdxOfSmallestGroup(largestGroup, (lastGroupSize>>1)+1, shiftsStackAsBits);
                
                if (bitShift>=0) {
                    
                    int groupSize = largestGroup[bitShift];
                    
                    int newBit = 1<=0) {
                    
                    int groupSize = charLenCountGroups[seedIdx];
                    
                    // remember the chosen seed and the size of the largest group it leaves behind
                    charHashSeed[charHashSeedDepth++] = seedIdx;
                    lastGroupSize = groupSize;
                    
                    // a largest group of one means nothing is left to split
                    if (1==groupSize) {
                        needsMoreSplitting = false;
                        break;
                    }
                    
                    
                } else {
                   break;                  
                }
                
            }
            
            
            
        } while (true);
        
        return new GeneratedHashRules(needsMoreSplitting, shiftsStackAsBits, shiftsStack, shiftStackDepth, charHashSeed, charHashSeedDepth);
      
    }
    

    /**
     * Counts, per bit of the length, how many values of the selected group have that bit set.
     * A value belongs to the group when {@code (filterMaskForLen & length) == filterCheckForLen}.
     * The counters are cleared first and the number of group members is stored in the last
     * slot of {@code countSpace}.
     *
     * The char seed arguments and {@code mixBits}/{@code sigBits} are accepted so the signature
     * matches countSplitBalancePerBitB; they are not read here.
     *
     * @param values            candidate values
     * @param filterMaskForLen  length bits that have already been chosen
     * @param filterCheckForLen required value of those bits for a member of the group
     * @param countSpace        per-bit counters, last slot receives the group size
     * @return true when the group contains values of at least two different lengths
     */
    private boolean countSplitBalancePerBitA(CharSequence[] values, int filterMaskForLen, int filterCheckForLen, 
                                            int[] charSeeds, int charSeedsDepth, int charSeedsFilter, int[] countSpace, int mixBits, int sigBits) {
        Arrays.fill(countSpace, 0);

        int members = 0;
        int firstLength = -1;      // length of the first member encountered, -1 until one is seen
        boolean uniform = true;    // stays true while every member shares firstLength

        for (int idx = values.length - 1; idx >= 0; idx--) {
            final int len = values[idx].length();
            if ((filterMaskForLen & len) != filterCheckForLen) {
                continue; // not part of the selected group
            }

            if (firstLength < 0) {
                firstLength = len;
            } else if (firstLength != len) {
                uniform = false;
            }

            members++;
            sumEachBit(filterMaskForLen, countSpace, len, countSpace.length - 1);
        }

        countSpace[countSpace.length - 1] = members;
        return !uniform;
    }
    
    /**
     * Counts, for every candidate seed index below the last slot of {@code countSpace}, how
     * many values of the selected group produce a 1 from {@code charSplitBit}. A value is in
     * the group when its length passes the length filter and {@code isApplicable} accepts it
     * for the seeds already chosen. The counters are cleared first and the group size is
     * stored in the last slot.
     *
     * @param values            candidate values
     * @param filterMaskForLen  length bits that have already been chosen
     * @param filterCheckForLen required value of those bits for a member of the group
     * @param charSeeds         seeds that have already been chosen
     * @param charSeedsDepth    number of valid entries in {@code charSeeds}
     * @param charSeedsFilter   expected bit per chosen seed
     * @param countSpace        per-seed counters, last slot receives the group size
     * @param mixBits           number of mixer bits; anything other than 0 is rejected
     * @param sigBits           number of low seed bits that select the bit within a char
     * @return true when the group holds more than one member
     * @throws UnsupportedOperationException when {@code mixBits} yields a non-zero mixer mask
     */
    private boolean countSplitBalancePerBitB(CharSequence[] values, int filterMaskForLen, int filterCheckForLen,
            int[] charSeeds, int charSeedsDepth, int charSeedsFilter, int[] countSpace, int mixBits, int sigBits) {
        Arrays.fill(countSpace, 0);

        // hash constants derived from the two bit counts
        final int mixMask = (1 << mixBits) - 1;
        if (mixMask != 0) {
            throw new UnsupportedOperationException();
        }
        final int sigMask = (1 << sigBits) - 1;

        int selected = 0;
        for (int v = values.length - 1; v >= 0; v--) {
            final CharSequence candidate = values[v];
            final int len = candidate.length();

            if ((filterMaskForLen & len) != filterCheckForLen) {
                continue; // wrong length group
            }
            // allow everything except defined char seeds that do not match
            if (!isApplicable(mixBits, mixMask, sigBits, sigMask, candidate, len, charSeeds, charSeedsDepth, charSeedsFilter)) {
                continue;
            }

            selected++;

            // every slot but the last is one seed to try; the last is reserved for the total
            for (int seed = countSpace.length - 2; seed >= 0; seed--) {
                countSpace[seed] += charSplitBit(mixBits, mixMask, sigBits, sigMask, seed, candidate, len);
            }
        }

        countSpace[countSpace.length - 1] = selected;
        return selected > 1;
    }

    /**
     * Checks a value against every char seed chosen so far. For seed number {@code i} the bit
     * produced by {@code charSplitBit} must equal bit {@code i} of {@code charSeedsFilter}.
     * Seeds are checked from the highest index down and the check stops at the first mismatch.
     *
     * @param charSeeds       seeds that have already been chosen
     * @param charSeedsLength number of valid entries in {@code charSeeds}
     * @param charSeedsFilter expected bit per chosen seed
     * @return true when every chosen seed yields its expected bit (also when there are none)
     */
    private boolean isApplicable(final int mixerShift, final int mixerMask, final int cBitShift, final int cBitsMask, CharSequence value, int length, int[] charSeeds, int charSeedsLength, int charSeedsFilter) {
        for (int seed = charSeedsLength - 1; seed >= 0; seed--) {
            final int expectedBit = (charSeedsFilter >> seed) & 1;
            final int actualBit = charSplitBit(mixerShift, mixerMask, cBitShift, cBitsMask, charSeeds[seed], value, length);
            if (expectedBit != actualBit) {
                return false;
            }
        }
        return true;
    }

    /**
     * Derives the bit index from seed {@code n} by shifting out the mixer bits, then
     * delegates to {@code splitBit}.
     *
     * @return 0 or 1, as produced by {@code splitBit}
     */
    private int charSplitBit(final int mixerShift, final int mixerMask, final int cBitShift, final int cBitsMask, int n, CharSequence cs, int len) {
        final int bitIndex = n >> mixerShift;
        return splitBit(mixerMask, cBitShift, cBitsMask, n, cs, len, bitIndex);
    }

    /**
     * Breaks {@code bitIndx} into a bit position within a char (its low bits under
     * {@code cBitsMask}) and a char position (the remaining high bits), takes the number of
     * chars to sum from the mixer bits of {@code n} plus one, and delegates to the
     * char-summing overload with a starting sum of 0.
     *
     * @return 0 or 1, as produced by the delegate
     */
    private int splitBit(final int mixerMask, final int cBitShift, final int cBitsMask, int n, CharSequence cs, int len, int bitIndx) {
        final int bitWithinChar = bitIndx & cBitsMask;
        final int charPosition = bitIndx >> cBitShift;
        final int charsToMix = (n & mixerMask) + 1;
        return splitBit(cs, len, bitWithinChar, charPosition, 0, charsToMix);
    }

    /**
     * Adds {@code mixCount} chars of {@code cs} to {@code c}, reading positions
     * {@code cPos + mixCount - 1} down to {@code cPos}, each wrapped with {@code % len},
     * and returns bit {@code bitShift} of the sum.
     *
     * @param cs       text to read chars from
     * @param len      modulus applied to every char position
     * @param bitShift which bit of the sum to return
     * @param cPos     first char position
     * @param c        starting value of the sum
     * @param mixCount how many chars to add
     * @return 0 or 1
     */
    private int splitBit(CharSequence cs, int len, int bitShift, int cPos, int c, int mixCount) {
        int sum = c;
        for (int offset = mixCount - 1; offset >= 0; offset--) {
            sum += cs.charAt((offset + cPos) % len);
        }
        return (sum >> bitShift) & 1;
    }
    

    // NOTE(review): this region is damaged in this copy of the file. On the line containing
    // "(1<= 0)" everything between a '<' and the next '>' appears to have been lost, so the
    // rest of this method and the header of the following one are missing. The code below is
    // left untouched; restore it from the upstream source before relying on it.
    //
    // What is still visible: once the length shift depth is exhausted the char seed count
    // down is decremented; when that also drops below zero the group is counted with
    // countSplitBalancePerBitA (charSeedsDepth == -1) or countSplitBalancePerBitB (otherwise),
    // the per-bit maxima are updated if the group needs splitting, and that flag is returned.
    private boolean recursiveBalanceScan(final CharSequence[] values, 
                                          byte[] lengthShifts, final int lengthShifsDepth, int lengthShiftGroupFilter, int lengthShiftsInBitFormat,
                                          int[] charSeeds,     int charSeedsDepth, int charSeedCountDown, int charSeedsGroupFilter, int[] largeGroupCount, int[] countSpace,
                                          int mixBits, int sigBits) {
                
        int lengthFilterLeft  = lengthShiftGroupFilter;
        int lengthFitlerRight = lengthShiftGroupFilter;
        
        int charSeedsGroupFilterLeft = charSeedsGroupFilter;
        int charSeedsGroupFilterRight = charSeedsGroupFilter;
        
    //    System.out.println(lengthShifsDepth+" "+charSeedsDepth);
        
        int charSeedFilter = charSeedCountDown;
        
        final int lengthBitFilter = lengthShifsDepth - 1;     
        if (lengthBitFilter < 0) {
         //   lengthBitFilter = lengthShifsDepth;
            //we have reached the bottom of the length masks, the lengthShiftGroupFilter is as detailed as it will get.
                        
            //are we at the true bottom or continuing of the chars
          //  charSeedFilter--;
            charSeedFilter--;
            if (charSeedFilter < 0 ) {
                
                //this is the true bottom of both lengths and charSeeds

                //use this mask and test this case            
                boolean needsSplit;
                
                // a depth of -1 selects the length-based count, anything else the char-based count
                if (-1==charSeedsDepth) {
                    needsSplit = countSplitBalancePerBitA(values, lengthShiftsInBitFormat, lengthShiftGroupFilter, charSeeds, charSeedsDepth, charSeedsGroupFilter, countSpace, mixBits, sigBits);
                } else {
                    needsSplit = countSplitBalancePerBitB(values, lengthShiftsInBitFormat, lengthShiftGroupFilter, charSeeds, charSeedsDepth, charSeedsGroupFilter, countSpace, mixBits, sigBits);
                }
                
                //only review those that need splitting
                if (needsSplit) {
                    accumulateLargestGroups(countSpace, largeGroupCount);
                }
                
                return needsSplit;                
                
            } else {
               //charSeedsGroupFilterLeft |= 0; //nothing to do
               // NOTE(review): text is missing from the next line onward; what follows the gap
               // looks like the loop of a different, unfinished method.
               charSeedsGroupFilterRight |= (1<= 0) {
            
            
            CharSequence value = values[i];
            //divide by length first then use bits.
            
            if (isValueSelected(value)) {
                //find which bit most evenly splits the group.
                //store the lengths as well
                
                //sorted list of lenghts? find the middle?
                
                
                
            }
            
            
            
            
            
            
            
            
        }
        
        
    }


    /**
     * Placeholder selection test: currently accepts every value.
     *
     * @param value the candidate value (not inspected yet)
     * @return always {@code true}
     */
    private static boolean isValueSelected(CharSequence value) {
        //TODO: needs the extra args to check the full stack.
        return true;
    }
    
    
    public static class LengthComparitor implements Comparator {

        @Override
        public int compare(CharSequence o1, CharSequence o2) {
            return Integer.compare(o1.length(), o2.length());
        }
        
        
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy