All downloads are free. Search and download functionality uses the official Maven repository.

com.day.cq.wcm.designimporter.util.StreamUtil Maven / Gradle / Ivy

The newest version!
package com.day.cq.wcm.designimporter.util;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.tika.utils.CharsetUtils;

/**
 * Utility methods for peeking into HTML document streams.
 */
public class StreamUtil {

    /**
     * Matches {@code <meta http-equiv="Content-Type" content="...">}.
     * Group 1 is the raw value of the {@code content} attribute,
     * e.g. {@code text/html; charset=UTF-8}.
     */
    private static final Pattern HTTP_EQUIV_PATTERN = Pattern.compile(
            "(?is)<meta\\s+http-equiv\\s*=\\s*['\"]\\s*Content-Type['\"]\\s+"
            + "content\\s*=\\s*['\"]([^'\"]+)['\"]");

    /**
     * Matches the HTML5 form {@code <meta charset="...">}.
     * Group 1 is the charset name.
     */
    private static final Pattern META_CHARSET_PATTERN = Pattern.compile(
            "(?is)<meta\\s+charset\\s*=\\s*['\"]([^'\"]+)['\"]");

    /**
     * Matches the {@code encoding} pseudo-attribute of an XML declaration,
     * e.g. {@code <?xml version="1.0" encoding="UTF-8"?>}.
     * Group 1 is the charset name.
     */
    private static final Pattern XHTML_ENCODING_PATTERN = Pattern.compile(
            "(?is)<\\?xml\\s[^>]*?encoding\\s*=\\s*['\"]([^'\"]+)['\"]");

    /** Maximum number of bytes inspected at the start of the stream; also used as the mark limit. */
    private static final int META_TAG_BUFFER_SIZE = 8192;

    /** Charset name returned when no supported declaration is found. */
    private static final String DEFAULT_ENCODING = "UTF-8";

    /**
     * Determines the encoding of the passed HTML document stream by peeking into its
     * {@code <meta>} tags, or into the XML declaration if it is an XHTML document.
     *
     * <p>At most {@link #META_TAG_BUFFER_SIZE} bytes are read, and the stream is reset to
     * its original position before this method returns. The declarations are tried in
     * this order: {@code http-equiv} Content-Type, HTML5 {@code meta charset}, XML
     * declaration. The first one naming a supported charset wins.
     *
     * @param stream The InputStream of the HTML document being imported; it must support
     *               {@code mark}/{@code reset} (for example a {@code BufferedInputStream})
     * @return The determined charset name, or {@code "UTF-8"} if none could be determined
     * @throws IOException if the stream does not support mark/reset, or if reading or
     *                     resetting the stream fails
     */
    public static String getEncoding(InputStream stream) throws IOException {
        // Fail before consuming anything: without mark support the bytes read below
        // could not be handed back to the caller.
        if (!stream.markSupported()) {
            throw new IOException("Cannot detect encoding: stream does not support mark/reset");
        }

        stream.mark(META_TAG_BUFFER_SIZE);
        byte[] buffer = new byte[META_TAG_BUFFER_SIZE];
        int bytesRead = 0;
        // Read raw bytes instead of going through a Reader: a decoding reader may pull
        // more bytes than the mark limit from the stream, which would invalidate the mark.
        while (bytesRead < buffer.length) {
            int count = stream.read(buffer, bytesRead, buffer.length - bytesRead);
            if (count == -1) {
                break;
            }
            bytesRead += count;
        }
        stream.reset();

        if (bytesRead > 0) {
            // The markup that declares the charset is plain ASCII, so an ASCII decode is
            // sufficient to find it whatever the real document encoding is.
            String metaString = new String(buffer, 0, bytesRead, "US-ASCII");

            Matcher m = HTTP_EQUIV_PATTERN.matcher(metaString);
            if (m.find()) {
                // TIKA-349: flexible handling of attributes
                // We have one or more x or x=y attributes, separated by ';'
                String[] attrs = m.group(1).split(";");
                for (String attr : attrs) {
                    String[] keyValue = attr.trim().split("=");
                    if ((keyValue.length == 2) && keyValue[0].equalsIgnoreCase("charset")) {
                        // TIKA-459: improve charset handling.
                        String charset = CharsetUtils.clean(keyValue[1]);
                        if (CharsetUtils.isSupported(charset)) {
                            return charset;
                        }
                    }
                }
            }

            // Pattern to match HTML5 meta charset
            // example <meta charset="utf-8">
            m = META_CHARSET_PATTERN.matcher(metaString);
            if (m.find()) {
                String charset = m.group(1);
                if (CharsetUtils.isSupported(charset)) {
                    return charset;
                }
            }

            // Pattern to match XHTML encoding
            // example <?xml version="1.0" encoding="UTF-8"?>
            m = XHTML_ENCODING_PATTERN.matcher(metaString);
            if (m.find()) {
                String charset = m.group(1);
                if (CharsetUtils.isSupported(charset)) {
                    return charset;
                }
            }
        }

        return DEFAULT_ENCODING;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy