org.jcodings.Encoding Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jcodings Show documentation
Show all versions of jcodings Show documentation
Byte based encoding support library for java
The newest version!
/*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is furnished to do
* so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
package org.jcodings;
import java.nio.charset.Charset;
import org.jcodings.ascii.AsciiTables;
import org.jcodings.constants.CharacterType;
import org.jcodings.exception.EncodingException;
import org.jcodings.exception.EncodingError;
import org.jcodings.exception.ErrorMessages;
import org.jcodings.exception.InternalException;
import org.jcodings.util.BytesHash;
public abstract class Encoding implements Cloneable {
public static final int CHAR_INVALID = -1;
private static int count;
protected final int minLength, maxLength;
private final boolean isFixedWidth, isSingleByte;
private boolean isAsciiCompatible;
protected boolean isUnicode = false, isUTF8 = false;
private byte[]name;
private int hashCode;
private int index;
private Charset charset = null;
private boolean isDummy = false;
private String stringName;
protected Encoding(String name, int minLength, int maxLength) {
setName(name);
this.minLength = minLength;
this.maxLength = maxLength;
this.isFixedWidth = minLength == maxLength;
this.isSingleByte = isFixedWidth && minLength == 1;
this.index = count++;
this.isAsciiCompatible = minLength == 1;
}
protected final void setName(String name) {
this.name = name.getBytes();
this.hashCode = BytesHash.hashCode(this.name, 0, this.name.length);
this.stringName = name;
}
protected final void setName(byte[]name) {
this.name = name;
this.hashCode = BytesHash.hashCode(this.name, 0, this.name.length);
this.stringName = new String(name);
}
protected final void setDummy() {
isDummy = true;
isAsciiCompatible = false;
}
@Override
public final String toString() {
return stringName;
}
@Override
public final boolean equals(Object other) {
return this == other;
}
@Override
public final int hashCode() {
return hashCode;
}
public final int getIndex() {
return index;
}
public final byte[]getName() {
return name;
}
public final boolean isDummy() {
return isDummy;
}
public final boolean isAsciiCompatible() {
return isAsciiCompatible;
}
public final boolean isUnicode() {
return isUnicode;
}
public final boolean isUTF8() {
return isUTF8;
}
/**
* If this encoding is capable of being represented by a Java Charset
* then provide it. Otherwise this will raise a CharsetNotFound error via the JDK APIs.
*
* To reduce cases like jruby/jruby#4716, we always attempt to find a charset here, and default to using the
* encoding name which is never null. Either the encoding will exist in the JDK or it will fail hard, rather
* than propagating a null Charset. Encodings with names different than those found in the JDK can override
* this getCharsetName to provide that name or getCharset to return the right Charset.
*/
public Charset getCharset() {
if (charset == null) {
charset = Charset.forName(getCharsetName());
}
return charset;
}
/**
* The name of the equivalent Java Charset for this encoding.
*
* Defaults to the name of the encoding. Subclasses can override this to provide a different name.
*
* @return the name of the equivalent Java Charset for this encoding
*/
public String getCharsetName() {
return stringName;
}
Encoding replicate(byte[]name) {
try {
Encoding clone = (Encoding)clone();
clone.setName(name);
clone.index = count++;
return clone;
} catch (CloneNotSupportedException cnse){
throw new EncodingException(EncodingError.ERR_COULD_NOT_REPLICATE, new String(name));
}
}
/**
* Returns character length given character head
* returns 1
for singlebyte encodings or performs direct length table lookup for multibyte ones.
*
* @param c
* Character head
* Oniguruma equivalent: mbc_enc_len
*
* To be deprecated very soon (use length(byte[]bytes, int p, int end) version)
*/
public abstract int length(byte c);
/**
* Returns character length given stream, character position and stream end
* returns 1
for singlebyte encodings or performs sanity validations for multibyte ones
* and returns the character length, missing characters in the stream otherwise
*
* @return
* 0 Never
* > 0 Valid character, length returned
* -1 Illegal/malformed character
* < -1 (-1 - n) Number of missing bytes for character in p...end range
*
* Oniguruma equivalent: mbc_enc_len
* modified for 1.9 purposes,
*/
public abstract int length(byte[]bytes, int p, int end);
/**
* Returns maximum character byte length that can appear in an encoding
*
* Oniguruma equivalent: max_enc_len
*/
public final int maxLength() {
return maxLength;
}
/* ONIGENC_MBC_MAXLEN_DIST */
@Deprecated
public final int maxLengthDistance() {
return maxLength();
}
/**
* Returns minimum character byte length that can appear in an encoding
*
* Oniguruma equivalent: min_enc_len
*/
public final int minLength() {
return minLength;
}
/**
* Returns true if bytes[p]
is a head of a new line character
*
* Oniguruma equivalent: is_mbc_newline
*/
public abstract boolean isNewLine(byte[]bytes, int p, int end);
/**
* Returns code point for a character
*
* Oniguruma equivalent: mbc_to_code
*/
public abstract int mbcToCode(byte[]bytes, int p, int end);
/**
* Returns character length given a code point
*
* Oniguruma equivalent: code_to_mbclen
*/
public abstract int codeToMbcLength(int code);
/**
* Extracts code point into it's multibyte representation
*
* @return character length for the given code point
*
* Oniguruma equivalent: code_to_mbc
*/
public abstract int codeToMbc(int code, byte[]bytes, int p);
/**
* Performs case folding for a character at bytes[pp.value]
*
* @param flag case fold flag
* @param pp an IntHolder
that points at character head
* @param to a buffer where to extract case folded character
*
* Oniguruma equivalent: mbc_case_fold
*/
public abstract int mbcCaseFold(int flag, byte[]bytes, IntHolder pp, int end, byte[]to);
/**
* Returns lower case table if it's safe to use it directly, otherwise null
* Used for fast case insensitive matching for some singlebyte encodings
*
* @return lower case table
*/
public byte[] toLowerCaseTable() {return null;}
/**
* Expand case folds given a character class (used for case insensitive matching)
*
* @param flag case fold flag
* @param fun case folding functor (look at: ApplyCaseFold
)
* @param arg case folding functor argument (look at: ApplyCaseFoldArg
)
*
* Oniguruma equivalent: apply_all_case_fold
*/
public abstract void applyAllCaseFold(int flag, ApplyAllCaseFoldFunction fun, Object arg);
/**
* Expand AST string nodes into their folded alternatives (look at: Analyser.expandCaseFoldString
)
*
* Oniguruma equivalent: get_case_fold_codes_by_str
*/
public abstract CaseFoldCodeItem[]caseFoldCodesByString(int flag, byte[]bytes, int p, int end);
/**
* Returns character type given character type name (used when e.g. \p{Alpha})
*
* Oniguruma equivalent: property_name_to_ctype
*/
public abstract int propertyNameToCType(byte[]bytes, int p, int end);
/**
* Perform a check whether given code is of given character type (e.g. used by isWord(someByte) and similar methods)
*
* @param code a code point of a character
* @param ctype a character type to check against
*
* Oniguruma equivalent: is_code_ctype
*/
public abstract boolean isCodeCType(int code, int ctype);
/**
* Returns code range for a given character type
*
* Oniguruma equivalent: get_ctype_code_range
*/
public abstract int[]ctypeCodeRange(int ctype, IntHolder sbOut);
/**
* Seeks the previous character head in a stream
*
* Oniguruma equivalent: left_adjust_char_head
*
* @param bytes byte stream
* @param p position
* @param s stop
* @param end end
*/
public abstract int leftAdjustCharHead(byte[]bytes, int p, int s, int end);
/**
* Returns true if it's safe to use reversal Boyer-Moore search fail fast algorithm
*
* Oniguruma equivalent: is_allowed_reverse_match
*/
public abstract boolean isReverseMatchAllowed(byte[]bytes, int p, int end);
/**
*
* Oniguruma equivalent: case_map
*/
public abstract int caseMap(IntHolder flagP, byte[]bytes, IntHolder pp, int end, byte[]to, int toP, int toEnd);
/* onigenc_get_right_adjust_char_head / ONIGENC_LEFT_ADJUST_CHAR_HEAD */
public final int rightAdjustCharHead(byte[]bytes, int p, int s, int end) {
int p_ = leftAdjustCharHead(bytes, p, s, end);
if (p_ < s) p_ += length(bytes, p_, end);
return p_;
}
/* onigenc_get_right_adjust_char_head_with_prev */
public final int rightAdjustCharHeadWithPrev(byte[]bytes, int p, int s, int end, IntHolder prev) {
int p_ = leftAdjustCharHead(bytes, p, s, end);
if (p_ < s) {
if (prev != null) prev.value = p_;
p_ += length(bytes, p_, end);
} else {
if (prev != null) prev.value = -1; /* Sorry */
}
return p_;
}
/* onigenc_get_prev_char_head */
public final int prevCharHead(byte[]bytes, int p, int s, int end) {
if (s <= p) return -1; // ??
return leftAdjustCharHead(bytes, p, s - 1, end);
}
/* onigenc_step_back */
public final int stepBack(byte[]bytes, int p, int s, int end, int n) {
while (s != -1 && n-- > 0) {
if (s <= p) return -1;
s = leftAdjustCharHead(bytes, p, s - 1, end);
}
return s;
}
/* onigenc_step */
public final int step(byte[]bytes, int p, int end, int n) {
int q = p;
while (n-- > 0) {
q += length(bytes, q, end);
}
return q <= end ? q : -1;
}
/* onigenc_strlen */
public abstract int strLength(byte[]bytes, int p, int end);
public abstract int strCodeAt(byte[]bytes, int p, int end, int index);
/* onigenc_strlen_null */
public final int strLengthNull(byte[]bytes, int p, int end) {
int n = 0;
while (true) {
if (bytes[p] == 0) {
int len = minLength();
if (len == 1) return n;
int q = p + 1;
while (len > 1) {
if (bytes[q] != 0) break;
q++;
len--;
}
if (len == 1) return n;
}
p += length(bytes, p, end);
n++;
}
}
/* onigenc_str_bytelen_null */
public final int strByteLengthNull(byte[]bytes, int p, int end) {
int p_, start;
p_ = start = 0;
while (true) {
if (bytes[p_] == 0) {
int len = minLength();
if (len == 1) return p_ - start;
int q = p_ + 1;
while (len > 1) {
if (q >= bytes.length) return p_ - start;
if (bytes[q] != 0) break;
q++;
len--;
}
if (len == 1) return p_ - start;
}
p_ += length(bytes, p_, end);
}
}
/* onigenc_with_ascii_strncmp */
public final int strNCmp(byte[]bytes, int p, int end, byte[]ascii, int asciiP, int n) {
while (n-- > 0) {
if (p >= end) return ascii[asciiP];
int c = mbcToCode(bytes, p, end);
int x = ascii[asciiP] - c;
if (x != 0) return x;
asciiP++;
p += length(bytes, p, end);
}
return 0;
}
public final boolean isNewLine(int code) {
return isCodeCType(code, CharacterType.NEWLINE);
}
public final boolean isGraph(int code) {
return isCodeCType(code, CharacterType.GRAPH);
}
public final boolean isPrint(int code) {
return isCodeCType(code, CharacterType.PRINT);
}
public final boolean isAlnum(int code) {
return isCodeCType(code, CharacterType.ALNUM);
}
public final boolean isAlpha(int code) {
return isCodeCType(code, CharacterType.ALPHA);
}
public final boolean isLower(int code) {
return isCodeCType(code, CharacterType.LOWER);
}
public final boolean isUpper(int code) {
return isCodeCType(code, CharacterType.UPPER);
}
public final boolean isCntrl(int code) {
return isCodeCType(code, CharacterType.CNTRL);
}
public final boolean isPunct(int code) {
return isCodeCType(code, CharacterType.PUNCT);
}
public final boolean isSpace(int code) {
return isCodeCType(code, CharacterType.SPACE);
}
public final boolean isBlank(int code) {
return isCodeCType(code, CharacterType.BLANK);
}
public final boolean isDigit(int code) {
return isCodeCType(code, CharacterType.DIGIT);
}
public final boolean isXDigit(int code) {
return isCodeCType(code, CharacterType.XDIGIT);
}
public final boolean isWord(int code) {
return isCodeCType(code, CharacterType.WORD);
}
// ONIGENC_IS_MBC_WORD
public final boolean isMbcWord(byte[]bytes, int p, int end) {
return isWord(mbcToCode(bytes, p, end));
}
// IS_CODE_SB_WORD
public final boolean isSbWord(int code) {
return isAscii(code) && isWord(code);
}
// ONIGENC_IS_MBC_HEAD
public final boolean isMbcHead(byte[]bytes, int p, int end) {
return length(bytes, p, end) != 1;
}
public boolean isMbcCrnl(byte[]bytes, int p, int end) {
return mbcToCode(bytes, p, end) == 13 && isNewLine(bytes, p + length(bytes, p, end), end);
}
// ============================================================
// helpers
// ============================================================
public static int digitVal(int code) {
return code - '0';
}
public static int odigitVal(int code) {
return digitVal(code);
}
public final int xdigitVal(int code) {
if (isDigit(code)) {
return digitVal(code);
} else {
return isUpper(code) ? code - 'A' + 10 : code - 'a' + 10;
}
}
// ONIGENC_IS_MBC_ASCII
public static boolean isMbcAscii(byte b) {
return (b & 0xff) < 128; // b > 0 ?
}
// ONIGENC_IS_CODE_ASCII
public static boolean isAscii(int code) {
return code < 128;
}
public static boolean isAscii(byte b) {
return b >= 0;
}
public static byte asciiToLower(int c) {
return AsciiTables.ToLowerCaseTable[c];
}
public static byte asciiToUpper(int c) {
return AsciiTables.ToUpperCaseTable[c];
}
public static boolean isWordGraphPrint(int ctype) {
return ctype == CharacterType.WORD ||
ctype == CharacterType.GRAPH ||
ctype == CharacterType.PRINT;
}
@Deprecated
public final int mbcodeStartPosition() {
return minLength() > 1 ? 0 : 0x80;
}
public final boolean isSingleByte() {
return isSingleByte;
}
public final boolean isFixedWidth() {
return isFixedWidth;
}
public static final byte NEW_LINE = (byte)0x0a;
public static Encoding load(String name) {
return load(name, "org.jcodings.specific");
}
public static Encoding load(String name, String pkg) {
String encClassName = pkg + "." + name + "Encoding";
Class> encClass;
try {
encClass = Class.forName(encClassName);
} catch (ClassNotFoundException cnfe) {
throw new InternalException(ErrorMessages.ERR_ENCODING_CLASS_DEF_NOT_FOUND, encClassName);
}
try {
return (Encoding)encClass.getField("INSTANCE").get(encClass);
} catch (Exception e2) {
throw new InternalException(ErrorMessages.ERR_ENCODING_LOAD_ERROR, encClassName);
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy