com.day.cq.wcm.designimporter.util.StreamUtil Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of aem-sdk-api Show documentation
Show all versions of aem-sdk-api Show documentation
The Adobe Experience Manager SDK
The newest version!
package com.day.cq.wcm.designimporter.util;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.tika.utils.CharsetUtils;
/**
 * Stream-related utility methods.
 */
public class StreamUtil {

    /**
     * Matches {@code <meta http-equiv="Content-Type" content="...">}; group 1 is the raw
     * {@code content} attribute value, e.g. {@code text/html; charset=UTF-8}.
     *
     * NOTE(review): the expression was lost from this copy of the file (only the "(?is)" flags
     * survived); it is reconstructed from the Tika HTML parser pattern that the TIKA-349/TIKA-459
     * comments below refer to - confirm against the original source.
     */
    private static final Pattern HTTP_EQUIV_PATTERN = Pattern.compile(
            "(?is)<meta\\s+http-equiv\\s*=\\s*['\"]\\s*Content-Type['\"]"
                    + "\\s+content\\s*=\\s*['\"]([^'\"]+)['\"]");

    /**
     * Matches the HTML5 form {@code <meta charset="...">}; group 1 is the charset name.
     *
     * NOTE(review): this constant was referenced but not declared in this copy of the file;
     * the expression is a reconstruction - confirm against the original source.
     */
    private static final Pattern META_CHARSET_PATTERN = Pattern.compile(
            "(?is)<meta\\s+charset\\s*=\\s*['\"]([^'\"]+)['\"]");

    /**
     * Matches the XML declaration {@code <?xml ... encoding="..." ?>}; group 1 is the encoding name.
     *
     * NOTE(review): this constant was referenced but not declared in this copy of the file;
     * the expression is a reconstruction - confirm against the original source.
     */
    private static final Pattern XHTML_ENCODING_PATTERN = Pattern.compile(
            "(?is)<\\?xml\\s[^>]*?encoding\\s*=\\s*['\"]([^'\"]+)['\"]");

    /** Number of leading bytes of the document that are inspected for an encoding hint. */
    private static final int META_TAG_BUFFER_SIZE = 8192;

    /** Encoding reported when the document head carries no usable hint. */
    private static final String DEFAULT_ENCODING = "UTF-8";

    /**
     * Determines the encoding of the passed HTML document stream by peeking into its
     * {@code <meta>} tags, or into the XML declaration if it is an XHTML document.
     * <p>
     * Only the first {@value #META_TAG_BUFFER_SIZE} bytes are inspected. The stream is marked
     * before reading and reset afterwards, so on return it is positioned where it was on entry.
     * The hints are tried in this order: {@code http-equiv} Content-Type meta tag, HTML5
     * {@code <meta charset>}, XML declaration. A hint naming an unsupported charset is skipped.
     *
     * @param stream The InputStream of the HTML document being imported; it must support
     *               mark/reset (wrap it in a {@code BufferedInputStream} otherwise)
     * @return The determined charset, or {@code "UTF-8"} if none could be determined
     * @throws IOException if the stream does not support mark/reset or reading from it fails
     */
    public static String getEncoding(InputStream stream) throws IOException {
        // Fail before consuming anything: without mark/reset the peeked bytes would be lost
        // to the caller and reset() would throw anyway.
        if (!stream.markSupported()) {
            throw new IOException("mark/reset not supported by " + stream.getClass().getName());
        }

        String head = peekHead(stream);

        Matcher m = HTTP_EQUIV_PATTERN.matcher(head);
        if (m.find()) {
            // TIKA-349: flexible handling of attributes
            // We have one or more x or x=y attributes, separated by ';'
            for (String attr : m.group(1).split(";")) {
                String[] keyValue = attr.trim().split("=");
                if (keyValue.length == 2 && keyValue[0].trim().equalsIgnoreCase("charset")) {
                    // TIKA-459: improve charset handling.
                    String charset = CharsetUtils.clean(keyValue[1]);
                    if (CharsetUtils.isSupported(charset)) {
                        return charset;
                    }
                }
            }
        }

        // HTML5 meta charset, e.g. <meta charset="UTF-8">
        String charset = firstSupportedCharset(META_CHARSET_PATTERN, head);
        if (charset != null) {
            return charset;
        }

        // XHTML encoding, e.g. <?xml version="1.0" encoding="UTF-8"?>
        charset = firstSupportedCharset(XHTML_ENCODING_PATTERN, head);
        if (charset != null) {
            return charset;
        }

        return DEFAULT_ENCODING;
    }

    /**
     * Reads up to {@value #META_TAG_BUFFER_SIZE} bytes from the stream, rewinds it and returns
     * the bytes decoded as US-ASCII.
     */
    private static String peekHead(InputStream stream) throws IOException {
        byte[] head = new byte[META_TAG_BUFFER_SIZE];
        int length = 0;

        stream.mark(META_TAG_BUFFER_SIZE);
        // Read the raw bytes ourselves: a Reader may buffer ahead of what it hands out and
        // thereby run past the mark's read limit. The "> 0" test also ends the loop on EOF (-1).
        int read;
        while (length < head.length
                && (read = stream.read(head, length, head.length - length)) > 0) {
            length += read;
        }
        stream.reset();

        return new String(head, 0, length, StandardCharsets.US_ASCII);
    }

    /**
     * Returns group 1 of the first match of {@code pattern} in {@code text} if it names a
     * supported charset, otherwise {@code null}.
     */
    private static String firstSupportedCharset(Pattern pattern, String text) {
        Matcher m = pattern.matcher(text);
        if (m.find()) {
            String charset = m.group(1);
            if (CharsetUtils.isSupported(charset)) {
                return charset;
            }
        }
        return null;
    }
}