com.adobe.internal.xmp.impl.Latin1Converter Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of xmpcore Show documentation
The XMP Library for Java is based on the C++ XMPCore library and the API is similar.
There is a newer version: 6.1.11
// =================================================================================================
// ADOBE SYSTEMS INCORPORATED
// Copyright 2006 Adobe Systems Incorporated
// All Rights Reserved
//
// NOTICE:  Adobe permits you to use, modify, and distribute this file in accordance with the terms
// of the Adobe license agreement accompanying it.
// =================================================================================================



package com.adobe.internal.xmp.impl;

import java.io.UnsupportedEncodingException;


/**
 * Utility that repairs byte buffers which are flagged as UTF-8 but contain stray
 * Latin-1 / Windows Cp1252 bytes, by re-encoding those stray bytes as UTF-8.
 * The class only offers static methods and cannot be instantiated.
 *
 * @author  Stefan Makswit
 * @version $Revision$
 * @since   12.10.2006
 */
public class Latin1Converter
{
	/** Scanner state: not inside a multi-byte sequence, the next byte starts a new char. */
	private static final int STATE_START = 0;
	/** Scanner state: a UTF-8 lead byte has been read, continuation bytes are expected. */
	private static final int STATE_UTF8CHAR = 11;

	/** Smallest byte value that can start a multi-byte UTF-8 sequence (0xC0/0xC1 are overlong). */
	private static final int MIN_UTF8_LEAD = 0xC2;
	/** Largest byte value that can start a UTF-8 sequence (0xF5..0xFF exceed U+10FFFF). */
	private static final int MAX_UTF8_LEAD = 0xF4;
	/** Maximum number of bytes of one UTF-8 encoded char (lead byte + 3 continuation bytes). */
	private static final int MAX_UTF8_LENGTH = 4;

	/** Marker inside {@link #CP1252_C1_MAPPING} for bytes that Cp1252 leaves undefined. */
	private static final int CP1252_UNDEFINED = 0;

	/**
	 * Unicode code points of the Cp1252 bytes 0x80..0x9F (index = byte value - 0x80).
	 * The five bytes that Cp1252 does not define carry {@link #CP1252_UNDEFINED}.
	 */
	private static final int[] CP1252_C1_MAPPING =
	{
		0x20AC, CP1252_UNDEFINED, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 0x80..0x87
		0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, CP1252_UNDEFINED, 0x017D, CP1252_UNDEFINED, // 0x88..0x8F
		CP1252_UNDEFINED, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 0x90..0x97
		0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, CP1252_UNDEFINED, 0x017E, 0x0178  // 0x98..0x9F
	};


	/**
	 * Private constructor, the class is a pure collection of static helpers.
	 */
	private Latin1Converter()
	{
		// EMPTY
	}


	/**
	 * A converter that processes a byte buffer containing a mix of UTF-8 and
	 * Latin-1/Cp1252 chars. The result is a buffer where the Latin-1/Cp1252 bytes
	 * have been converted to UTF-8.
	 * <p>
	 * Explanation of the processing: the buffer is only touched when
	 * <code>buffer.getEncoding()</code> reports <code>"UTF-8"</code>; for any other
	 * encoding the very same buffer instance is returned, because UTF-16/32 flavours
	 * do not require further processing.
	 * <p>
	 * In the UTF-8 case every byte that does not fit into a structurally correct
	 * UTF-8 sequence is assumed to be a Latin-1/Cp1252 encoded char and is replaced by
	 * the UTF-8 byte sequence of that char. A byte is regarded as misplaced when it is
	 * <ul>
	 * <li>a continuation byte (0x80..0xBF) without a preceding lead byte,</li>
	 * <li>a byte that can never start a UTF-8 sequence (0xC0, 0xC1, 0xF5..0xFF),</li>
	 * <li>a lead byte that is not followed by the announced number of continuation
	 * bytes; in that case only the lead byte is converted and scanning resumes at the
	 * byte directly after it.</li>
	 * </ul>
	 * Only the lead/continuation structure is checked; overlong encodings and
	 * surrogate code points that use an otherwise well-formed byte pattern are copied
	 * unchanged.
	 * <p>
	 * The 0x80..0x9F range is undefined in Latin-1, but is defined in Windows code
	 * page 1252. The bytes 0x81, 0x8D, 0x8F, 0x90, and 0x9D are formally undefined
	 * by Windows 1252. These are in XML's RestrictedChar set, so we map them to a
	 * space.
	 * <p>
	 * The official Latin-1 characters in the range 0xA0..0xFF are converted into
	 * the Unicode Latin Supplement range U+00A0 - U+00FF.
	 * <p>
	 * Example: If an Euro-symbol (€) appears in the byte buffer (0xE2, 0x82, 0xAC),
	 * it will be left as is. But if only the first two bytes are appearing,
	 * followed by an ASCII char A (0xE2 - 0x82 - 0x41), it will be converted to
	 * 0xC3, 0xA2 (â) - 0xE2, 0x80, 0x9A (‚) - 0x41 (A).
	 *
	 * @param buffer a byte buffer containing UTF-8 that may be mixed with
	 *        Latin-1/Cp1252 bytes
	 * @return Returns a new buffer containing the repaired UTF-8 bytes, or
	 *         <code>buffer</code> itself if its encoding is not UTF-8.
	 */
	public static ByteBuffer convert(ByteBuffer buffer)
	{
		if (!"UTF-8".equals(buffer.getEncoding()))
		{
			// Latin-1 fixing applies only to UTF-8
			return buffer;
		}

		// holds the bytes of the UTF-8 char that is currently being collected
		byte[] readAheadBuffer = new byte[MAX_UTF8_LENGTH];
		// the number of bytes read ahead
		int readAhead = 0;
		// number of continuation bytes that are still missing
		int expectedBytes = 0;
		// output buffer with estimated length; n + n/3 equals n*4/3 without
		// overflowing the int range for large inputs
		ByteBuffer out = new ByteBuffer(buffer.length() + buffer.length() / 3);

		int state = STATE_START;
		for (int i = 0; i < buffer.length(); i++)
		{
			int b = buffer.charAt(i);

			switch (state)
			{
				default:
				case STATE_START:
					if (b <= 0x7F)
					{
						// plain ASCII is copied as is
						out.append((byte) b);
					}
					else if (b >= MIN_UTF8_LEAD  &&  b <= MAX_UTF8_LEAD)
					{
						// start of UTF-8 sequence, the lead byte tells its length
						if (b < 0xE0)
						{
							expectedBytes = 1;
						}
						else if (b < 0xF0)
						{
							expectedBytes = 2;
						}
						else
						{
							expectedBytes = 3;
						}
						readAheadBuffer[readAhead++] = (byte) b;
						state = STATE_UTF8CHAR;
					}
					else
					{
						// stray continuation byte or a byte that cannot start a
						// UTF-8 sequence, assume it to be Latin-1/Cp1252
						out.append(convertToUTF8((byte) b));
					}
					break;

				case STATE_UTF8CHAR:
					if (expectedBytes > 0  &&  (b & 0xC0) == 0x80)
					{
						// valid continuation byte, add to readAheadBuffer
						readAheadBuffer[readAhead++] = (byte) b;
						expectedBytes--;

						if (expectedBytes == 0)
						{
							// sequence is complete, copy it unchanged
							out.append(readAheadBuffer, 0, readAhead);
							readAhead = 0;

							state = STATE_START;
						}
					}
					else
					{
						// broken UTF-8 sequence:
						// 1. convert only the lead byte of the sequence to UTF-8
						out.append(convertToUTF8(readAheadBuffer[0]));

						// 2. rewind so that the loop increment continues at the
						//    second byte of the broken sequence
						i = i - readAhead;
						readAhead = 0;

						state = STATE_START;
					}
					break;
			}
		}

		// loop ends with "half" UTF-8 char --> assume that the bytes are Latin-1
		if (state == STATE_UTF8CHAR)
		{
			for (int j = 0; j < readAhead; j++)
			{
				out.append(convertToUTF8(readAheadBuffer[j]));
			}
		}

		return out;
	}


	/**
	 * Converts a Cp1252 char (contains all Latin-1 chars above 0x80) into a
	 * UTF-8 byte sequence. The bytes 0x81, 0x8D, 0x8F, 0x90, and 0x9D are
	 * formally undefined by Windows 1252 and therefore replaced by a space
	 * (0x20). Bytes below 0x80 are returned unchanged.
	 * <p>
	 * The conversion uses a fixed mapping table instead of a charset lookup, so it
	 * does not depend on the charsets installed in the running JRE and can never
	 * fall back to emitting a raw, non-UTF-8 byte.
	 *
	 * @param ch
	 *            a Cp1252 / Latin-1 byte
	 * @return Returns a byte array containing a UTF-8 byte sequence.
	 */
	private static byte[] convertToUTF8(byte ch)
	{
		int c = ch & 0xFF;
		if (c < 0x80)
		{
			// ASCII is identical in UTF-8
			return new byte[] { ch };
		}

		// 0xA0..0xFF map one-to-one onto U+00A0..U+00FF,
		// 0x80..0x9F need the Cp1252 table
		int codePoint = c;
		if (c < 0xA0)
		{
			codePoint = CP1252_C1_MAPPING[c - 0x80];
			if (codePoint == CP1252_UNDEFINED)
			{
				return new byte[] { 0x20 }; // space for undefined
			}
		}

		if (codePoint < 0x800)
		{
			// two byte form: 110xxxxx 10xxxxxx
			return new byte[]
			{
				(byte) (0xC0 | (codePoint >> 6)),
				(byte) (0x80 | (codePoint & 0x3F))
			};
		}

		// three byte form: 1110xxxx 10xxxxxx 10xxxxxx
		// (all table entries are below U+10000)
		return new byte[]
		{
			(byte) (0xE0 | (codePoint >> 12)),
			(byte) (0x80 | ((codePoint >> 6) & 0x3F)),
			(byte) (0x80 | (codePoint & 0x3F))
		};
	}
}