com.google.zxing.aztec.encoder.HighLevelEncoder Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of org.apache.servicemix.bundles.zxing
This OSGi bundle wraps ${pkgArtifactId} ${pkgVersion} jar file.
There is a newer version: 3.5.3_1
/*
 * Copyright 2013 ZXing authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.zxing.aztec.encoder;

import com.google.zxing.common.BitArray;
import com.google.zxing.common.CharacterSetECI;

import java.nio.charset.Charset;

import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Deque;
import java.util.Iterator;
import java.util.LinkedList;

/**
 * This produces nearly optimal encodings of text into the first-level of
 * encoding used by Aztec code.
 *
 * It uses a dynamic algorithm.  For each prefix of the string, it determines
 * a set of encodings that could lead to this prefix.  We repeatedly add a
 * character and generate a new set of optimal encodings until we have read
 * through the entire input.
 *
 * @author Frank Yellin
 * @author Rustam Abdullaev
 */
public final class HighLevelEncoder {

  static final String[] MODE_NAMES = {"UPPER", "LOWER", "DIGIT", "MIXED", "PUNCT"};

  static final int MODE_UPPER = 0; // 5 bits
  static final int MODE_LOWER = 1; // 5 bits
  static final int MODE_DIGIT = 2; // 4 bits
  static final int MODE_MIXED = 3; // 5 bits
  static final int MODE_PUNCT = 4; // 5 bits

  // The Latch Table shows, for each pair of Modes, the optimal method for
  // getting from one mode to another.  In the worst possible case, this can
  // be up to 14 bits.  In the best possible case, we are already there!
  // The high half-word of each entry gives the number of bits.
  // The low half-word of each entry are the actual bits necessary to change
  static final int[][] LATCH_TABLE = {
    {
      0,
      (5 << 16) + 28,              // UPPER -> LOWER
      (5 << 16) + 30,              // UPPER -> DIGIT
      (5 << 16) + 29,              // UPPER -> MIXED
      (10 << 16) + (29 << 5) + 30, // UPPER -> MIXED -> PUNCT
    },
    {
      (9 << 16) + (30 << 4) + 14,  // LOWER -> DIGIT -> UPPER
      0,
      (5 << 16) + 30,              // LOWER -> DIGIT
      (5 << 16) + 29,              // LOWER -> MIXED
      (10 << 16) + (29 << 5) + 30, // LOWER -> MIXED -> PUNCT
    },
    {
      (4 << 16) + 14,              // DIGIT -> UPPER
      (9 << 16) + (14 << 5) + 28,  // DIGIT -> UPPER -> LOWER
      0,
      (9 << 16) + (14 << 5) + 29,  // DIGIT -> UPPER -> MIXED
      (14 << 16) + (14 << 10) + (29 << 5) + 30,
                                   // DIGIT -> UPPER -> MIXED -> PUNCT
    },
    {
      (5 << 16) + 29,              // MIXED -> UPPER
      (5 << 16) + 28,              // MIXED -> LOWER
      (10 << 16) + (29 << 5) + 30, // MIXED -> UPPER -> DIGIT
      0,
      (5 << 16) + 30,              // MIXED -> PUNCT
    },
    {
      (5 << 16) + 31,              // PUNCT -> UPPER
      (10 << 16) + (31 << 5) + 28, // PUNCT -> UPPER -> LOWER
      (10 << 16) + (31 << 5) + 30, // PUNCT -> UPPER -> DIGIT
      (10 << 16) + (31 << 5) + 29, // PUNCT -> UPPER -> MIXED
      0,
    },
  };

  // A reverse mapping from [mode][char] to the encoding for that character
  // in that mode.  An entry of 0 indicates no mapping exists.
  private static final int[][] CHAR_MAP = new int[5][256];
  static {
    CHAR_MAP[MODE_UPPER][' '] = 1;
    for (int c = 'A'; c <= 'Z'; c++) {
      CHAR_MAP[MODE_UPPER][c] = c - 'A' + 2;
    }
    CHAR_MAP[MODE_LOWER][' '] = 1;
    for (int c = 'a'; c <= 'z'; c++) {
      CHAR_MAP[MODE_LOWER][c] = c - 'a' + 2;
    }
    CHAR_MAP[MODE_DIGIT][' '] = 1;
    for (int c = '0'; c <= '9'; c++) {
      CHAR_MAP[MODE_DIGIT][c] = c - '0' + 2;
    }
    CHAR_MAP[MODE_DIGIT][','] = 12;
    CHAR_MAP[MODE_DIGIT]['.'] = 13;
    int[] mixedTable = {
        '\0', ' ', '\1', '\2', '\3', '\4', '\5', '\6', '\7', '\b', '\t', '\n',
        '\13', '\f', '\r', '\33', '\34', '\35', '\36', '\37', '@', '\\', '^',
        '_', '`', '|', '~', '\177'
    };
    for (int i = 0; i < mixedTable.length; i++) {
      CHAR_MAP[MODE_MIXED][mixedTable[i]] = i;
    }
    int[] punctTable = {
        '\0', '\r', '\0', '\0', '\0', '\0', '!', '\'', '#', '$', '%', '&', '\'',
        '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?',
        '[', ']', '{', '}'
    };
    for (int i = 0; i < punctTable.length; i++) {
      if (punctTable[i] > 0) {
        CHAR_MAP[MODE_PUNCT][punctTable[i]] = i;
      }
    }
  }

  // A map showing the available shift codes.  (The shifts to BINARY are not
  // shown
  static final int[][] SHIFT_TABLE = new int[6][6]; // mode shift codes, per table
  static {
    for (int[] table : SHIFT_TABLE) {
      Arrays.fill(table, -1);
    }
    SHIFT_TABLE[MODE_UPPER][MODE_PUNCT] = 0;

    SHIFT_TABLE[MODE_LOWER][MODE_PUNCT] = 0;
    SHIFT_TABLE[MODE_LOWER][MODE_UPPER] = 28;

    SHIFT_TABLE[MODE_MIXED][MODE_PUNCT] = 0;

    SHIFT_TABLE[MODE_DIGIT][MODE_PUNCT] = 0;
    SHIFT_TABLE[MODE_DIGIT][MODE_UPPER] = 15;
  }

  private final byte[] text;
  private final Charset charset;

  public HighLevelEncoder(byte[] text) {
    this.text = text;
    this.charset = null;
  }

  public HighLevelEncoder(byte[] text, Charset charset) {
    this.text = text;
    this.charset = charset;
  }

  /**
   * @return text represented by this encoder encoded as a {@link BitArray}
   */
  public BitArray encode() {
    State initialState = State.INITIAL_STATE;
    if (charset != null) {
      CharacterSetECI eci = CharacterSetECI.getCharacterSetECI(charset);
      if (null == eci) {
        throw new IllegalArgumentException("No ECI code for character set " + charset);
      }
      initialState = initialState.appendFLGn(eci.getValue());
    }
    Collection states = Collections.singletonList(initialState);
    for (int index = 0; index < text.length; index++) {
      int pairCode;
      int nextChar = index + 1 < text.length ? text[index + 1] : 0;
      switch (text[index]) {
        case '\r':
          pairCode = nextChar == '\n' ? 2 : 0;
          break;
        case '.' :
          pairCode = nextChar == ' ' ? 3 : 0;
          break;
        case ',' :
          pairCode = nextChar == ' ' ? 4 : 0;
          break;
        case ':' :
          pairCode = nextChar == ' ' ? 5 : 0;
          break;
        default:
          pairCode = 0;
      }
      if (pairCode > 0) {
        // We have one of the four special PUNCT pairs.  Treat them specially.
        // Get a new set of states for the two new characters.
        states = updateStateListForPair(states, index, pairCode);
        index++;
      } else {
        // Get a new set of states for the new character.
        states = updateStateListForChar(states, index);
      }
    }
    // We are left with a set of states.  Find the shortest one.
    State minState = Collections.min(states, new Comparator() {
      @Override
      public int compare(State a, State b) {
        return a.getBitCount() - b.getBitCount();
      }
    });
    // Convert it to a bit array, and return.
    return minState.toBitArray(text);
  }

  // We update a set of states for a new character by updating each state
  // for the new character, merging the results, and then removing the
  // non-optimal states.
  private Collection updateStateListForChar(Iterable states, int index) {
    Collection result = new LinkedList<>();
    for (State state : states) {
      updateStateForChar(state, index, result);
    }
    return simplifyStates(result);
  }

  // Return a set of states that represent the possible ways of updating this
  // state for the next character.  The resulting set of states are added to
  // the "result" list.
  private void updateStateForChar(State state, int index, Collection result) {
    char ch = (char) (text[index] & 0xFF);
    boolean charInCurrentTable = CHAR_MAP[state.getMode()][ch] > 0;
    State stateNoBinary = null;
    for (int mode = 0; mode <= MODE_PUNCT; mode++) {
      int charInMode = CHAR_MAP[mode][ch];
      if (charInMode > 0) {
        if (stateNoBinary == null) {
          // Only create stateNoBinary the first time it's required.
          stateNoBinary = state.endBinaryShift(index);
        }
        // Try generating the character by latching to its mode
        if (!charInCurrentTable || mode == state.getMode() || mode == MODE_DIGIT) {
          // If the character is in the current table, we don't want to latch to
          // any other mode except possibly digit (which uses only 4 bits).  Any
          // other latch would be equally successful *after* this character, and
          // so wouldn't save any bits.
          State latchState = stateNoBinary.latchAndAppend(mode, charInMode);
          result.add(latchState);
        }
        // Try generating the character by switching to its mode.
        if (!charInCurrentTable && SHIFT_TABLE[state.getMode()][mode] >= 0) {
          // It never makes sense to temporarily shift to another mode if the
          // character exists in the current mode.  That can never save bits.
          State shiftState = stateNoBinary.shiftAndAppend(mode, charInMode);
          result.add(shiftState);
        }
      }
    }
    if (state.getBinaryShiftByteCount() > 0 || CHAR_MAP[state.getMode()][ch] == 0) {
      // It's never worthwhile to go into binary shift mode if you're not already
      // in binary shift mode, and the character exists in your current mode.
      // That can never save bits over just outputting the char in the current mode.
      State binaryState = state.addBinaryShiftChar(index);
      result.add(binaryState);
    }
  }

  private static Collection updateStateListForPair(Iterable states, int index, int pairCode) {
    Collection result = new LinkedList<>();
    for (State state : states) {
      updateStateForPair(state, index, pairCode, result);
    }
    return simplifyStates(result);
  }

  private static void updateStateForPair(State state, int index, int pairCode, Collection result) {
    State stateNoBinary = state.endBinaryShift(index);
    // Possibility 1.  Latch to MODE_PUNCT, and then append this code
    result.add(stateNoBinary.latchAndAppend(MODE_PUNCT, pairCode));
    if (state.getMode() != MODE_PUNCT) {
      // Possibility 2.  Shift to MODE_PUNCT, and then append this code.
      // Every state except MODE_PUNCT (handled above) can shift
      result.add(stateNoBinary.shiftAndAppend(MODE_PUNCT, pairCode));
    }
    if (pairCode == 3 || pairCode == 4) {
      // both characters are in DIGITS.  Sometimes better to just add two digits
      State digitState = stateNoBinary
          .latchAndAppend(MODE_DIGIT, 16 - pairCode)  // period or comma in DIGIT
          .latchAndAppend(MODE_DIGIT, 1);             // space in DIGIT
      result.add(digitState);
    }
    if (state.getBinaryShiftByteCount() > 0) {
      // It only makes sense to do the characters as binary if we're already
      // in binary mode.
      State binaryState = state.addBinaryShiftChar(index).addBinaryShiftChar(index + 1);
      result.add(binaryState);
    }
  }

  private static Collection simplifyStates(Iterable states) {
    Deque result = new LinkedList<>();
    for (State newState : states) {
      boolean add = true;
      for (Iterator iterator = result.iterator(); iterator.hasNext();) {
        State oldState = iterator.next();
        if (oldState.isBetterThanOrEqualTo(newState)) {
          add = false;
          break;
        }
        if (newState.isBetterThanOrEqualTo(oldState)) {
          iterator.remove();
        }
      }
      if (add) {
        result.addFirst(newState);
      }
    }
    return result;
  }

}