All downloads are free. Search and download functionality uses the official Maven repository.

com.api.json.JSONStreamTokenizer Maven / Gradle / Ivy

Go to download

APIs to manipulate JSON, modeled after IBM's json4j, including key-sorted serialization

There is a newer version: 4.0.0
Show newest version
/**
 * (c) Copyright 2018-2023 IBM Corporation
 * 1 New Orchard Road, 
 * Armonk, New York, 10504-1722
 * United States
 * +1 914 499 1900
 * Nathaniel Mills [email protected]
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package com.api.json;

import java.io.IOException;
import java.io.PushbackReader;
import java.io.Reader;
import java.io.StreamTokenizer;

/**
 * This class provides similar services to {@link StreamTokenizer} but it works
 * on a character by character basis, whereas the StreamTokenizer is based on
 * individual bytes so is not handling solidus nor unicode correctly. This class
 * is solely for parsing JSON content.
 * <p>
 * Each call to {@link #nextToken(Integer[])} sets {@link #ttype} and, depending
 * on the token, {@link #nval} (numbers) or {@link #sval} (words and quoted
 * strings). The {@code location} array passed to the tokenizer is updated in
 * place: index {@link #LN_CNTR} holds the line counter and index
 * {@link #LN_OFFSET} holds the offset within the current line.
 */
public class JSONStreamTokenizer {

   // character table flags
   private static final int _IS_ORDINARY = 0x0000;
   private static final int _IS_NUMERIC = 0x0001;
   private static final int _IS_QUOTE = 0x0002;
   private static final int _IS_WHITESPACE = 0x0004;
   private static final int _IS_WORD = 0x0008;

   // character constants
   private static final int BSH = '\\';
   private static final int BSP = '\b';
   private static final int CRT = (int) '\r';
   private static final int DPT = (int) '.';
   private static final int DQTE = (int) '"';
   private static final int EOF = -1;
   private static final int FFD = (int) '\f';
   private static final int HYP = (int) '-';
   private static final int NLN = (int) '\n';
   private static final int SPC = (int) ' ';
   private static final int SQTE = (int) '\'';
   private static final int TAB = (int) '\t';

   // text type references
   public static final int TT_EOF = -1;
   public static final int TT_CR = '\r';
   public static final int TT_EOL = '\n';
   public static final int TT_NUMBER = -2;
   private static final int TT_UNKNOWN = -4;
   public static final int TT_WORD = -3;

   // state trackers
   boolean _atEOL = false;
   boolean _eolSignificant = false;
   boolean _isPushedBack = false;
   boolean _lowerCaseMode = false;
   int _nextChar = -1;

   PushbackReader _reader = null;
   StringBuilder _currentValue = new StringBuilder();
   // one flag word per UTF-16 code unit
   int[] _charTable = new int[65536];

   // compatibility variables
   public double nval = 0.0d;
   public String sval = null;
   public int ttype = TT_UNKNOWN;

   // indices into the location array: line counter and offset within the line
   public static int LN_CNTR = 0;
   public static int LN_OFFSET = 1;

   /**
    * Constructs the JST from a {@link java.io.Reader}. There is no constructor for
    * an {@link java.io.InputStream} because it can be made into a reader. The same
    * is true for a {@link String}.
    * 
    * @param r
    *          the reader to be read containing JSON content.
    */
   public JSONStreamTokenizer(Reader r) {
      resetSyntax();
      whitespaceChars(0x0000, SPC);
      wordChars('A', 'Z');
      wordChars('a', 'z');
      wordChars(0x00A0, 0x00FF);
      parseNumbers();
      quoteChar(SQTE);
      quoteChar(DQTE);
      _reader = new PushbackReader(r, 2);
   }

   /**
    * Looks up the character table flags for a character, treating end of file
    * (or any value outside the table) as an ordinary character so callers never
    * index the table with -1.
    * 
    * @param ch
    *           the character, or EOF
    * @return the flags recorded for the character
    */
   private int flagsOf(int ch) {
      if (ch < 0 || ch >= _charTable.length) {
         return _IS_ORDINARY;
      }
      return _charTable[ch];
   }

   /**
    * Reads the next character and advances the line offset by one.
    * 
    * @param location
    *                 line number and line offset values
    * @return the character read, or EOF
    * @throws IOException
    *                     if unable to read from the reader
    */
   private int advance(Integer[] location) throws IOException {
      int ch = readNextChar();
      location[LN_OFFSET] = location[LN_OFFSET] + 1;
      return ch;
   }

   /**
    * Returns a character to the reader and steps the line offset back by one
    * (never below zero). End of file is not pushed back: casting -1 to a char
    * would inject 0xFFFF into the stream, and the reader reports end of file
    * again on the next read anyway.
    * 
    * @param ch
    *                 the character to return, or EOF
    * @param location
    *                 line number and line offset values
    * @throws IOException
    *                     if unable to push back into the reader
    */
   private void unreadChar(int ch, Integer[] location) throws IOException {
      if (ch != EOF) {
         _reader.unread(ch);
      }
      if (location[LN_OFFSET] > 0) {
         location[LN_OFFSET] = location[LN_OFFSET] - 1;
      }
   }

   /**
    * Determine if we have reached an end of file condition.
    * 
    * @return true if end of file has been reached.
    */
   private boolean checkEOF() {
      if (_nextChar == EOF) {
         // reached end of file
         ttype = TT_EOF;
         return true;
      }
      return false;
   }

   /**
    * Determine if we are parsing a number. On success {@link #nval} holds the
    * value and {@link #ttype} is {@link #TT_NUMBER}. A hyphen that is not
    * followed by a numeric character is returned as its own token.
    * 
    * @param location line number and line offset values
    * @return true if a number (or a lone hyphen) was recognized
    * @throws IOException
    *                     if unable to read from the reader
    */
   private boolean checkForNumber(Integer[] location) throws IOException {
      if ((flagsOf(_nextChar) & _IS_NUMERIC) == 0) {
         return false;
      }
      if (_nextChar == HYP) {
         _nextChar = advance(location);
         // if next character isn't a number (this includes end of file)
         if ((flagsOf(_nextChar) & _IS_NUMERIC) == 0) {
            // return hyphen on its own
            unreadChar(_nextChar, location);
            ttype = HYP;
            return true;
         }
         // else we have started a negative number
         _currentValue.append((char) HYP);
      }
      int decimalCount = 0;
      while (true) {
         // accumulate exactly one copy of each numeric character
         _currentValue.append((char) _nextChar);
         _nextChar = advance(location);
         if ((flagsOf(_nextChar) & _IS_NUMERIC) == 0) {
            // end of file or a non-numeric character terminates the number
            unreadChar(_nextChar, location);
            try {
               nval = Double.parseDouble(_currentValue.toString());
               _currentValue.setLength(0);
               ttype = TT_NUMBER;
               return true;
            } catch (NumberFormatException nfe) {
               // not a valid number (e.g. a lone decimal point); let the
               // caller fall through to its other checks
               break;
            }
         }
         if (_nextChar == DPT) {
            decimalCount++;
         }
         if (decimalCount > 1) {
            // multiple decimal points cannot be a number
            unreadChar(_nextChar, location);
            break;
         }
      }
      return false;
   }

   /**
    * Reads the four hex digits that follow a backslash-u escape.
    * 
    * @param location line number and line offset
    * @return the decoded character, or the letter 'u' if any of the four
    *         characters read was not a hex digit (the characters read up to and
    *         including the offending one are consumed)
    * @throws IOException
    *                     if unable to read from the reader
    */
   private int readUnicodeEscape(Integer[] location) throws IOException {
      char[] digits = new char[4];
      for (int i = 0; i < digits.length; i++) {
         int ch = advance(location);
         if (!isHexChar(ch)) {
            return 'u';
         }
         digits[i] = (char) ch;
      }
      return (char) Integer.parseInt(new String(digits), 16);
   }

   /**
    * Determine if we have reached quoted content. On success {@link #sval}
    * holds the unescaped text and {@link #ttype} is the quote character. The
    * string ends at the matching quote, or early at a line break or end of file.
    * 
    * @param location line number and line offset
    * @return true if we have reached quoted content
    * @throws IOException
    *                     if unable to read from the reader
    */
   private boolean checkForQuotedWord(Integer[] location) throws IOException {
      if ((flagsOf(_nextChar) & _IS_QUOTE) == 0) {
         return false;
      }
      ttype = _nextChar; // save quote
      int _lookAhead = advance(location);
      while (_lookAhead != ttype && _lookAhead != NLN && _lookAhead != CRT && _lookAhead != EOF) {
         if (_lookAhead == BSH) {
            _nextChar = advance(location);
            if (_nextChar == EOF) {
               // dangling backslash at end of input: nothing to escape
               _lookAhead = EOF;
               break;
            }
            switch (_nextChar) {
            case 't': {
               _nextChar = TAB;
               break;
            }
            case 'n': {
               _nextChar = NLN;
               break;
            }
            case 'r': {
               _nextChar = CRT;
               break;
            }
            case 'f': {
               _nextChar = FFD;
               break;
            }
            case 'b': {
               _nextChar = BSP;
               break;
            }
            case 'u': {
               _nextChar = readUnicodeEscape(location);
               break;
            }
            default: {
               // backslash, quotes and anything else stand for themselves
               break;
            }
            }
         } else {
            _nextChar = _lookAhead;
         }
         _lookAhead = advance(location);
         _currentValue.append((char) _nextChar);
      } // end while looking for matching quote or EOL
      if (_lookAhead != ttype) {
         // hit EOL or EOF, not the matching quote
         unreadChar(_lookAhead, location);
      }
      sval = _currentValue.toString();
      if (_lowerCaseMode) {
         sval = sval.toLowerCase();
      }
      return true;
   }

   /**
    * Determine if we have reached a word. Note: this is used for detecting boolean
    * and null values. On success {@link #sval} holds the word and
    * {@link #ttype} is {@link #TT_WORD}.
    * 
    * @param location line number and line offset
    * @return true if we find unquoted words
    * @throws IOException
    *                     if unable to read from the reader
    */
   private boolean checkForWord(Integer[] location) throws IOException {
      if ((flagsOf(_nextChar) & _IS_WORD) == 0) {
         return false;
      }
      // keep reading word/numeric characters; end of file has no flags so it
      // ends the loop as well
      while ((flagsOf(_nextChar) & (_IS_NUMERIC | _IS_WORD)) != 0) {
         _currentValue.append((char) _nextChar);
         _nextChar = advance(location);
      }
      unreadChar(_nextChar, location);
      sval = _currentValue.toString();
      _currentValue.setLength(0);
      if (_lowerCaseMode) {
         sval = sval.toLowerCase();
      }
      ttype = TT_WORD;
      return true;
   }

   /**
    * Sets reporting when end of line is detected
    * 
    * @param flag
    *             whether or not to report when end of line is detected
    */
   public void eolIsSignificant(boolean flag) {
      _eolSignificant = flag;
   }

   /**
    * Handles an encountered end of line that can be signaled as CR, CR LF, or
    * LF. The line counter is advanced for CR LF and for LF. This method never
    * consumes the character that follows the line break; the caller reads it.
    * 
    * @param location
    *                 where we are in the input stream
    * @return true if an end of line was processed and must be reported
    *         (only when end of line is significant)
    * @throws IOException
    *                     if unable to read from the reader
    */
   boolean handleEndOfLine(Integer[] location) throws IOException {
      if (_nextChar == CRT) {
         // skip LF if there is one, else push back
         _nextChar = advance(location);
         if (_nextChar != NLN) {
            unreadChar(_nextChar, location);
         } else {
            // eat the newline
            location[LN_CNTR] = location[LN_CNTR] + 1;
            location[LN_OFFSET] = 0;
         }
         if (_eolSignificant) {
            ttype = TT_EOL;
            return true;
         }
      } else if (_nextChar == NLN) {
         location[LN_CNTR] = location[LN_CNTR] + 1;
         location[LN_OFFSET] = 0;
         if (_eolSignificant) {
            ttype = TT_EOL;
            return true;
         }
      } // else, keep ttype as next character
      return false;
   }

   /**
    * Tests input for hex characters 0-9, A-F, or a-f
    * 
    * @param test
    *             input character to be tested
    * @return true if input is a hex character
    */
   boolean isHexChar(int test) {
      return ( ('0' <= test && test <= '9') ||
               ('A' <= test && test <= 'F') ||
               ('a' <= test && test <= 'f') );
   }

   /**
    * Discovers the next token in the reader and returns its type (ttype). If a
    * number is detected, it is returned as a double in the nval. If a word or
    * quoted word is detected, it is returned in the sval.
    * 
    * @param location
    *                 where we are reading from the input stream; updated in
    *                 place
    * @return the type of next token encountered in the reader
    * @throws IOException
    *                     if unable to read from the reader
    */
   public int nextToken(Integer[] location) throws IOException {
      if (_isPushedBack) {
         // replay the token that was pushed back
         _isPushedBack = false;
         return ttype;
      }
      _currentValue.setLength(0);
      sval = null;
      _nextChar = advance(location);

      ttype = _nextChar;
      if (checkEOF()) {
         return ttype;
      }
      if (skipNewLines(location)) {
         location[LN_CNTR] = location[LN_CNTR] + 1;
         location[LN_OFFSET] = 0;
         if (ttype == TT_EOF) {
            // the newline was the last character of the input
            return ttype;
         }
      }
      if (skipWhitespace(location)) {
         return ttype;
      }
      // have first non-whitespace character: try number, then word, then
      // quoted word; each sets ttype when it matches
      if (checkForNumber(location)) {
         return ttype;
      }
      if (checkForWord(location)) {
         return ttype;
      }
      if (checkForQuotedWord(location)) {
         return ttype;
      }

      // else, return whatever character we've found
      return ttype;
   }

   /**
    * Sets the supplied character as ordinary in the character lookup table
    * 
    * @param ch
    *           The character to be flagged as ordinary
    */
   public void ordinaryChar(int ch) {
      _charTable[ch] = _IS_ORDINARY;
   }

   /**
    * Sets the range of characters supplied (including the endpoints) as ordinary
    * in the character lookup table
    * 
    * @param low
    *            starting, lower value character
    * @param hi
    *            ending, higher value character
    */
   public void ordinaryChars(int low, int hi) {
      for (int i = low; i <= hi; i++) {
         // clear any previous flags
         _charTable[i] = _IS_ORDINARY;
      }
   }

   /**
    * Initializes the character lookup table for known digits for numeric
    * characters, along with the decimal point and minus sign. TODO: add plus sign?
    */
   public void parseNumbers() {
      // use OR because NUMERIC characters can also be wordChars
      for (int i = '0'; i <= '9'; i++) {
         _charTable[i] |= _IS_NUMERIC;
      }
      _charTable[HYP] |= _IS_NUMERIC; // '-'
      _charTable[DPT] |= _IS_NUMERIC; // '.'
   }

   /**
    * Flags that the current token should be returned again by the next call to
    * {@link #nextToken(Integer[])}. Has no effect before the first token has
    * been read.
    * 
    * @throws IOException
    *                     declared for compatibility with existing callers
    */
   public void pushBack() throws IOException {
      if (ttype != TT_UNKNOWN) {
         _isPushedBack = true;
      }
   }

   /**
    * Sets the supplied character as a quote delimiter for quoted word
    * identification in the character lookup table
    * 
    * @param ch
    *           The character defining a quote character
    */
   public void quoteChar(int ch) {
      _charTable[ch] = _IS_QUOTE;
   }

   /**
    * Read the next character from the reader.
    * 
    * @return the next character read from the reader, or -1 at end of file
    * @throws IOException
    *                     if unable to read from the reader
    */
   private int readNextChar() throws IOException {
      if (_reader == null) {
         throw new IllegalStateException();
      }
      return _reader.read();
   }

   /**
    * Resets the character lookup table with single byte characters as ordinary,
    * and double byte characters as parts of words.
    */
   public void resetSyntax() {
      // allow for double byte characters to be digits later
      for (int i = 0; i < _charTable.length; i++) {
         _charTable[i] = (i > 255) ? _IS_WORD : _IS_ORDINARY;
      }
   }

   /**
    * Reads and eats a new line (LF) character, setting ttype as the next,
    * non-new line character or the end of file character.
    * 
    * @param location line count and line offset
    * @return true if a new line was consumed (ttype is TT_EOF if the input
    *         ended right after it)
    * @throws IOException
    *                     if unable to read from the reader
    */
   private boolean skipNewLines(Integer[] location) throws IOException {
      if (_nextChar != NLN) {
         return false;
      }
      _nextChar = advance(location);
      ttype = (_nextChar == EOF) ? TT_EOF : _nextChar;
      return true;
   }

   /**
    * Skips whitespace encountered up to a non-whitespace or end of line character,
    * or the end of file has been reached. When it returns false, ttype holds the
    * first non-whitespace character.
    * 
    * @param location line count and line offset
    * @return true if a significant end of line or end of file has been reached.
    * @throws IOException
    *                     if unable to read from the reader
    */
   private boolean skipWhitespace(Integer[] location) throws IOException {
      while ((flagsOf(_nextChar) & _IS_WHITESPACE) != 0) {
         // handle newline combinations CR, CR LF, LF
         if (handleEndOfLine(location)) {
            return true;
         } // else this is just whitespace so keep reading
         _nextChar = advance(location);
         if (checkEOF()) {
            return true;
         }
         // the token type follows the character we are now looking at
         ttype = _nextChar;
      }
      return false;
   }

   /**
    * Convenience method to see the state of the next token and its type (e.g.,
    * while debugging)
    * 
    * @see java.lang.Object#toString()
    */
   @Override
   public String toString() {
      String value = "";
      switch (ttype) {
      case TT_WORD: {
         value = "WORD" + " value:" + sval;
         break;
      }
      case TT_EOF: {
         value = "EOF";
         break;
      }
      case TT_EOL: {
         value = "EOL";
         break;
      }
      case TT_NUMBER: {
         value = "NUMBER:" + " value:" + Double.toString(nval);
         break;
      }
      case TT_UNKNOWN: {
         value = "unknown";
         break;
      }
      default: {
         value = "" + (char) ttype + "=0x" + Integer.toHexString(ttype);
         break;
      }
      }
      return "Token[" + value + "]";
   }

   /**
    * Sets the range of characters (including the endpoints) as whitespace in the
    * character lookup table
    * 
    * @param low
    *            starting, lower range of whitespace characters
    * @param hi
    *            ending, higher range of whitespace characters
    */
   public void whitespaceChars(int low, int hi) {
      for (int i = low; i <= hi; i++) {
         _charTable[i] = _IS_WHITESPACE;
      }
   }

   /**
    * Flags the supplied range of characters (including the endpoints) as word
    * characters in the lookup table. These characters may also be flagged as
    * numeric characters
    * 
    * @param low
    *            starting, lower character range of word characters
    * @param hi
    *            ending, higher character range of word characters
    */
   public void wordChars(int low, int hi) {
      for (int i = low; i <= hi; i++) {
         // use OR because wordChars can also be NUMERIC
         _charTable[i] |= _IS_WORD;
      }
   }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy