All Downloads are FREE. Search and download functionalities are using the official Maven repository.

test.TestScannerPerf Maven / Gradle / Ivy

There is a newer version: 1.3.3
Show newest version
package test;

import java.io.*;

public final class TestScannerPerf
{
    final static int INT_AMP = '&';
    final static int INT_LT = '<';
    final static int INT_RBRACKET = ']';

    final static int INT_SPACE = ' ';
    final static int INT_TAB = '\t';
    final static int INT_CR = '\r';
    final static int INT_LF = '\n';

    final static byte BYTE_LF = (byte) '\n';

    final static byte BYTE_NULL = (byte)0;

    final int mRepCount;

    int mTmpChar = 0;

    final byte[] mData;

    final byte[] mInputBuffer = new byte[4000];

    final char[] mOutputBuffer = new char[2000];

    final static int MB_CODE_BASE = 5;

    final static int[] CHAR_TYPES = new int[256];
    static {
        int code;
        for (int i = 128; i < 256; ++i) {
            int c = i;
            if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF)
                code = MB_CODE_BASE + 1;
            } else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF)
                code = MB_CODE_BASE + 2;
            } else if ((c & 0xF8) == 0xF0) {
                // 4 bytes; double-char with surrogates and all...
                code = MB_CODE_BASE + 3;
            } else {
                code = 1;
            }
            CHAR_TYPES[c] = code;
        }
        for (int i = 0; i < 32; ++i) {
            CHAR_TYPES[i] = 1; // invalid white space
        }
        CHAR_TYPES['\r'] = 2;
        CHAR_TYPES['\n'] = 2;
        CHAR_TYPES['\t'] = 0; // no processing needed
        CHAR_TYPES['<'] = 3;
        CHAR_TYPES['&'] = 4;
        CHAR_TYPES[']'] = 5;
    }

    InputStream mIn;

    int mLineNr;
    int mByteCount;
    int mTagCount;
    int mEntityCount;
    int mBracketCount;

    int mInputPtr;

    int mInputLen;

    int mTmpType = 0;

    public TestScannerPerf(byte[] data, int repCount)
    {
        mData = data;
        mRepCount = repCount;
    }

    public void test()
        throws IOException
    {
       int round = 0;
       mIn = new ByteArrayInputStream(mData);

       for (; true; ++round) {
           long now = System.currentTimeMillis();
           String msg = "[null]";
           int total = 0;

           final int TYPES = 3;

           if ((round % TYPES) == 0) {
               System.out.println();
           }

           for (int i = 0; i < mRepCount; ++i) {
               mIn.reset();

               mLineNr = 0;
               mTagCount = 0;
               mByteCount = 0;

               switch (round % TYPES) {
               case 0:
                   msg = "[Scanner-code]";
                   total += testScannerCode();
                   break;
               case 1:
                   msg = "[Scanner-int-arr]";
                   total += testScannerInts();
                   break;
               case 2:
                   msg = "[Scanner-int-arr2]";
                   total += testScannerInts2();
                   break;
               default:
                   throw new Error("Unexpected round, #"+i);
               }
           }

           now = System.currentTimeMillis() - now;
           System.out.println(msg+" -> "+now+" msecs (total "+total
                              +", byte count 0x"+Integer.toHexString(mByteCount)+")");

           try { Thread.sleep(200L); } catch (Exception e) { }
           System.gc();
           try { Thread.sleep(200L); } catch (Exception e) { }
       }
    }

    private int testScannerCode()
        throws IOException
    {
        final char[] outBuf = mOutputBuffer;
        int outPtr = 0;
        int c = 0;

        mInputLen = 0;
        mInputPtr = 0;

        main_loop:
        while (true) {
            // Next thing: let's get the first byte:
            int ptr = mInputPtr;

            ascii_loop:
            while (true) {
                if (ptr >= mInputLen) {
                    if (!loadMoreBytes()) {
                        break main_loop;
                    }
                    ptr = mInputPtr;
                }
                c = (int) mInputBuffer[ptr++];
                if (c <= INT_RBRACKET) {
                //if (c <= INT_LT) {
                    if (c < 0) {
                        break ascii_loop;
                    }
                    if (c < INT_SPACE) {
                        if (c == INT_CR) {
                            ++mLineNr;
                        } else if (c == INT_LF) {
                            ++mLineNr;
                        } else if (c != INT_TAB) {
                            throw new Error();
                        }
                    } else if (c == INT_LT) {
                        ++mTagCount;
                    } else if (c == INT_AMP) {
                        ++mEntityCount;
                    } else if (c == INT_RBRACKET) {
                        ++mBracketCount;
                    }
                }
                // !!! TODO: xml1.1, 0x7F?
                if (outPtr >= outBuf.length) {
                    outPtr = 0;
                }
                outBuf[outPtr++] = (char) c;
            }

            c = decodeMultiByteChar(c, ptr);
            if (c < 0) { // surrogate pair
                if (outPtr >= outBuf.length) {
                    outPtr = 0;
                }
                c = -c;
                // Let's add first part right away:
                outBuf[outPtr++] = (char) (0xD800 | (c >> 10));
                c = 0xDC00 | (c & 0x3FF);
                // And let the other char output in general loop
            }
            if (outPtr >= outBuf.length) {
                outPtr = 0;
            }
            outBuf[outPtr++] = (char) c;
        }
        return mByteCount;
    }

    private int testScannerInts()
        throws IOException
    {
        int outPtr = 0;
        int c = 0;
        final int[] TYPES = CHAR_TYPES;

        final byte[] inputBuffer = mInputBuffer;
        final char[] outputBuffer = mOutputBuffer;

        mInputLen = 0;
        mInputPtr = 0;

        main_loop:
        while (true) {
            // Next thing: let's get the first byte:
            int ptr = mInputPtr;

            ascii_loop:
            while (true) {
                if (ptr >= mInputLen) {
                    if (!loadMoreBytes()) {
                        break main_loop;
                    }
                    ptr = mInputPtr;
                }
                c = (int) inputBuffer[ptr++] & 0xFF;
                int type = TYPES[c];
                if (type != 0) {
                    switch (type) {                        
                    case 1:
                        throw new Error("Invalid white space");
                    case 2:
                        if (c == INT_CR) {
                            ++mLineNr;
                        } else if (c == INT_LF) {
                            ++mLineNr;
                        }
                        break;
                    case 3:
                        ++mTagCount;
                        break;
                    case 4:
                        ++mEntityCount;
                        break;
                    case 5:
                        ++mBracketCount;
                        break;
                    case 6: // 2 bytes
                    case 7: // 3 bytes
                    case 8: // 4 bytes
                        break ascii_loop;
                    default:
                        throw new Error();
                    }
                }
                if (outPtr >= outputBuffer.length) {
                    outPtr = 0;
                }
                outputBuffer[outPtr++] = (char) c;
            }

            c = decodeMultiByteChar(c, ptr);
            if (c < 0) { // surrogate pair
                if (outPtr >= outputBuffer.length) {
                    outPtr = 0;
                }
                c = -c;
                // Let's add first part right away:
                outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
                c = 0xDC00 | (c & 0x3FF);
                // And let the other char output in general loop
            }
            if (outPtr >= outputBuffer.length) {
                outPtr = 0;
            }
            outputBuffer[outPtr++] = (char) c;
        }
        return mByteCount;
    }

    private int testScannerInts2()
        throws IOException
    {
        int outPtr = 0;
        int c = 0;
        final int[] TYPES = CHAR_TYPES;

        final byte[] inputBuffer = mInputBuffer;
        char[] outputBuffer = mOutputBuffer;

        mInputLen = 0;
        mInputPtr = 0;

        main_loop:
        while (true) {
            // Next thing: let's get the first byte:

            ascii_loop:
            while (true) {
                int ptr = mInputPtr;
                if (ptr >= mInputLen) {
                    if (!loadMoreBytes()) {
                        break main_loop;
                    }
                    ptr = mInputPtr;
                }
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = mOutputBuffer;
                    outPtr = 0;
                }
                int max = mInputLen;
                {
                    int max2 = ptr + (outputBuffer.length - outPtr);
                    if (max2 < max) {
                        max = max2;
                    }
                }
                while (ptr < max) {
                    c = (int) inputBuffer[ptr++] & 0xFF;
                    if (TYPES[c] != 0) {
                        mInputPtr = ptr;
                        break ascii_loop;
                    }
                    outputBuffer[outPtr++] = (char) c;
                }
                mInputPtr = ptr;
            }

            switch (TYPES[c]) {
            case 1:
                throw new Error("Invalid white space");
            case 2:
                if (c == INT_CR) {
                    ++mLineNr;
                } else if (c == INT_LF) {
                    ++mLineNr;
                }
                break;
            case 3:
                ++mTagCount;
                break;
            case 4:
                // should expand entity
                ++mEntityCount;
                break;
            case 5:
                ++mBracketCount;
                break;
            case 6: // 2 bytes
                c = decodeMultiByteChar(c, mInputPtr);
                break;
            case 7: // 3 bytes
                c = decodeMultiByteChar(c, mInputPtr);
                break;
            case 8: // 4 bytes
                {
                    c = decodeMultiByteChar(c, mInputPtr);
                    if (outPtr >= outputBuffer.length) {
                        outputBuffer = mOutputBuffer;
                        outPtr = 0;
                    }
                    outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
                    c = 0xDC00 | (c & 0x3FF);
                }
                break;
            default:
                throw new Error();
            }
            
            if (outPtr >= outputBuffer.length) {
                outputBuffer = mOutputBuffer;
                outPtr = 0;
            }
            outputBuffer[outPtr++] = (char) c;
        }
        return mByteCount;
    }

    /*
    private final int decode(int ptr, int c, int type)
        throws IOException
    {
        switch (type) {
        case 1:
            throw new Error("Invalid white space");
        case 2:
            if (c == INT_CR) {
                ++mLineNr;
            } else if (c == INT_LF) {
                ++mLineNr;
            }
            break;
        case 3:
            ++mTagCount;
            break;
        case 4:
            // should expand entity
            ++mEntityCount;
            break;
        case 5:
            ++mBracketCount;
            break;
        case 6: // 2 bytes
        case 7: // 3 bytes
        case 8: // 4 bytes
            c = decodeMultiByteChar(c, ptr);
            break;
        default:
            throw new Error();
        }
        mInputPtr = ptr;
        return c;
    }
    */

    private final boolean loadMoreBytes()
        throws IOException
    {
        mByteCount += mInputLen;
        mInputPtr = 0;
        int count = mIn.read(mInputBuffer);
        if (count < 0) {
            mInputLen = 0;
            return false;
        }
        mInputLen = count;
        return true;
    }

    private final void loadMoreBytesGuaranteed()
        throws IOException
    {
        if (!loadMoreBytes()) {
            throw new Error();
        }
    }

    /*
    private final void markLF()
    {
        ++mLineNr;
    }

    private final void markLF(int pos)
    {
        ++mLineNr;
    }

    private final int handleEntityInText()
    {
        ++mEntityCount;
        return '&';
    }
    */

    private final int decodeMultiByteChar(int c, int ptr)
        throws IOException
    {
        int needed;

        if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF)
            c &= 0x1F;
            needed = 1;
        } else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF)
            c &= 0x0F;
            needed = 2;
        } else if ((c & 0xF8) == 0xF0) {
            // 4 bytes; double-char with surrogates and all...
            c &= 0x07;
            needed = 3;
        } else {
            throw new Error("Unexpected multi-byte first byte 0x"+Integer.toHexString(c));
        }

        if (ptr >= mInputLen) { // 2nd byte
            loadMoreBytesGuaranteed();
            ptr = mInputPtr;
        }
        int d = (int) mInputBuffer[ptr++];
        if ((d & 0xC0) != 0x080) {
            throw new Error();
        }
        c = (c << 6) | (d & 0x3F);
        
        if (needed > 1) { // needed == 1 means 2 bytes total
            if (ptr >= mInputLen) {
                loadMoreBytesGuaranteed();
                ptr = mInputPtr;
            }
            d = (int) mInputBuffer[ptr++];
            if ((d & 0xC0) != 0x080) {
                throw new Error();
            }
            c = (c << 6) | (d & 0x3F);
            if (needed > 2) { // 4 bytes? (need surrogates)
                if (ptr >= mInputLen) {
                    loadMoreBytesGuaranteed();
                    ptr = mInputPtr;
                }
                d = (int) mInputBuffer[ptr++];
                if ((d & 0xC0) != 0x080) {
                    throw new Error();
                }
                c = (c << 6) | (d & 0x3F);
                /* Need to signal such pair differently (to make comparison
                 * easier)
                 */
                return -c;
            }
        }
        mInputPtr = ptr;
        return c;
    }

    /*
    private final int decodeMultiByteChar(int c, int type, int ptr)
        throws IOException
    {
        // let's see how many add'l bytes are needed
        type -= MB_CODE_BASE;
        c &= (0x3F >> type); // 1f/0f/07 (for 2/3/4 bytes)
        if (ptr >= mInputEnd) { // 2nd byte
            loadMoreBytesGuaranteed();
            ptr = mInputPtr;
        }
        int d = (int) mInputBuffer[ptr++];
        if ((d & 0xC0) != 0x080) {
            throw new Error();
        }
        c = (c << 6) | (d & 0x3F);
        
        if (type > 1) { // needed == 1 means 2 bytes total
            if (ptr >= mInputEnd) {
                loadMoreBytesGuaranteed();
                ptr = mInputPtr;
            }
            d = (int) mInputBuffer[ptr++];
            if ((d & 0xC0) != 0x080) {
                throw new Error();
            }
            c = (c << 6) | (d & 0x3F);
            if (type > 2) { // 4 bytes? (need surrogates)
                if (ptr >= mInputEnd) {
                    loadMoreBytesGuaranteed();
                    ptr = mInputPtr;
                }
                d = (int) mInputBuffer[ptr++];
                if ((d & 0xC0) != 0x080) {
                    throw new Error();
                }
                c = (c << 6) | (d & 0x3F);
                // Need to signal such pair differently (to make comparison
                // easier)
                return -c;
            }
        }
        mInputPtr = ptr;
        return c;
    }
    */

    private static byte[] readData(File f)
        throws IOException
    {
        int len = (int) f.length();
        byte[] data = new byte[len];
        int offset = 0;
        FileInputStream fis = new FileInputStream(f);
        
        while (len > 0) {
            int count = fis.read(data, offset, len-offset);
            offset += count;
            len -= count;
        }
        fis.close();
        return data;
    }

    public static void main(String[] args)
        throws IOException
    {
        if (args.length != 1) {
            System.err.println("Usage: java ... [input file]");
            System.exit(1);
        }
        byte[] data = readData(new File(args[0]));
        int len = data.length;
        int repCount = 1;

        int THRESHOLD = 10 * 1000 * 1000;

        if (len < THRESHOLD) {
            repCount = (THRESHOLD / len);
        }
        //if (repCount > 2) { repCount /= 2; }

        System.out.println("Ok, read in test data, "+len+" bytes; using "+repCount+" repetitions");
        new TestScannerPerf(data, repCount).test();
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy