// patterntesting.tool.html.UmlautEncoder (Maven / Gradle / Ivy)
/*
* $Id: UmlautEncoder.java,v 1.10 2016/12/30 19:07:44 oboehm Exp $
*
* Copyright (c) 2010 by Oliver Boehm
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* (c)reated 06.01.2011 by oliver ([email protected])
*/
package patterntesting.tool.html;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import patterntesting.runtime.util.Converter;
import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Replaces German umlauts (and the sharp s) by their HTML entities.
 * <p>
 * Even if the encoding is declared correctly inside an HTML file, umlauts
 * are sometimes displayed incorrectly. One reason may be the web server,
 * which delivers the page with a different encoding than the one declared
 * in the file. To avoid such problems the umlauts can be converted into
 * their HTML representation (e.g. {@code &auml;}) using this class.
 * </p>
 *
 * @author oliver
 * @since 1.1 (06.01.2011)
 */
public final class UmlautEncoder {

    private static final Logger LOG = LogManager.getLogger(UmlautEncoder.class);

    /** Matches an HTML comment; DOTALL lets a comment span several lines. */
    private static final Pattern COMMENT_PATTERN =
            Pattern.compile("<!--.*?-->", Pattern.DOTALL);

    /**
     * Matches a charset declaration inside a meta tag. Covers both
     * {@code <meta http-equiv="Content-Type" content="text/html; charset=X">}
     * and {@code <meta charset="X">}. Group 1 is the charset name.
     */
    private static final Pattern META_CHARSET_PATTERN = Pattern.compile(
            "<meta\\b[^>]+?\\bcharset\\s*=\\s*['\"]?\\s*([^'\"\\s;/>]+)",
            Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** Utility class - no need to instantiate it. */
    private UmlautEncoder() {}

    /**
     * Encodes all umlauts (and the sharp s) of a string as HTML entities.
     * All other characters are copied unchanged.
     *
     * @param input the text to encode, must not be {@code null}
     * @return the text with umlauts replaced by their HTML entities
     */
    public static String encode(final String input) {
        final int length = input.length();
        // Entities are longer than the character they replace, so reserve some headroom.
        StringBuilder encoded = new StringBuilder(length + 16);
        for (int i = 0; i < length; i++) {
            appendEncoded(encoded, input.charAt(i));
        }
        return encoded.toString();
    }

    /**
     * Appends a single character to the given builder, replacing it by its
     * HTML entity if it is an umlaut or the sharp s.
     *
     * @param target the builder which receives the result
     * @param c a single character
     */
    private static void appendEncoded(final StringBuilder target, final char c) {
        switch (c) {
            case '\u00e4':
                target.append("&auml;");
                break;
            case '\u00f6':
                target.append("&ouml;");
                break;
            case '\u00fc':
                target.append("&uuml;");
                break;
            case '\u00df':
                target.append("&szlig;");
                break;
            case '\u00c4':
                target.append("&Auml;");
                break;
            case '\u00d6':
                target.append("&Ouml;");
                break;
            case '\u00dc':
                target.append("&Uuml;");
                break;
            default:
                target.append(c);
                break;
        }
    }

    /**
     * Encodes a file (or a directory tree) in place.
     *
     * @param file the file or directory to encode
     * @throws IOException Signals that an I/O exception has occurred.
     */
    public static void encode(final File file) throws IOException {
        encode(file, file);
    }

    /**
     * Encodes a file or a whole directory tree. If {@code from} is a
     * directory, its HTML files are encoded and all other files are copied.
     *
     * @param from the source file or directory
     * @param to the destination file or directory (may be the same as {@code from})
     * @throws IOException Signals that an I/O exception has occurred.
     */
    public static void encode(final File from, final File to) throws IOException {
        if (from.isDirectory()) {
            encodeDir(from, to);
        } else {
            encodeFile(from, to);
        }
    }

    /**
     * Reads a single file with its guessed encoding, encodes the umlauts and
     * writes the result as UTF-8.
     */
    private static void encodeFile(final File from, final File to) throws IOException {
        LOG.trace("encoding umlauts in {} to {}...", from, to);
        String encoded = encode(readFile(from));
        FileUtils.writeStringToFile(to, encoded, StandardCharsets.UTF_8);
    }

    /**
     * Reads the file once and decodes it with the encoding declared in its
     * meta tag (or the default charset if none is declared).
     */
    private static String readFile(final File file) throws IOException {
        byte[] bytes = FileUtils.readFileToByteArray(file);
        Charset encoding = guessEncoding(bytes);
        CharBuffer decoded = encoding.decode(ByteBuffer.wrap(bytes));
        return decoded.toString();
    }

    /**
     * Guesses the encoding of an HTML file. It looks for a charset
     * declaration inside a meta tag, e.g.
     * {@code <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">}
     * or {@code <meta charset="UTF-8">}. An XML declaration at the beginning
     * of the file is ignored. HTML comments are removed before the search,
     * so a meta tag which is commented out is ignored as well.
     * <p>
     * If no charset is declared, or the declared charset is not supported by
     * this JVM, the default charset is returned.
     * </p>
     *
     * @param file must be an HTML file
     * @return The encoding the file uses (or default if not apparent).
     * @throws IOException Signals that an I/O exception has occurred.
     */
    public static Charset guessEncoding(final File file) throws IOException {
        return guessEncoding(FileUtils.readFileToByteArray(file));
    }

    /**
     * Guesses the encoding from the raw content of an HTML file.
     *
     * @param bytes the raw file content
     * @return the declared charset, or the default charset as fallback
     */
    private static Charset guessEncoding(final byte[] bytes) {
        // The meta tag itself is plain ASCII. ISO-8859-1 maps every byte to
        // exactly one char, so sniffing never fails on malformed input.
        String content = new String(bytes, StandardCharsets.ISO_8859_1);
        String uncommented = COMMENT_PATTERN.matcher(content).replaceAll("");
        Matcher matcher = META_CHARSET_PATTERN.matcher(uncommented);
        if (matcher.find()) {
            String charsetName = matcher.group(1);
            try {
                return Charset.forName(charsetName);
            } catch (IllegalArgumentException ex) {
                // Covers IllegalCharsetNameException and UnsupportedCharsetException.
                LOG.warn("unsupported charset '" + charsetName + "' declared - using default", ex);
            }
        }
        return Charset.defaultCharset();
    }

    /**
     * Walks recursively through a directory: HTML files are encoded, all
     * other files are copied (unless source and destination are the same).
     */
    private static void encodeDir(final File from, final File to) throws IOException {
        if (!to.isDirectory()) {
            // mkdirs also creates missing parents; it fails if 'to' exists as a plain file.
            if (!to.mkdirs()) {
                throw new IOException("can't create dir " + to);
            }
            LOG.debug("created: dir {}", to);
        }
        File[] files = from.listFiles();
        if (files == null) {
            throw new IllegalArgumentException("not a directory: " + from);
        }
        for (File source : files) {
            File dest = new File(to, source.getName());
            if (source.isDirectory()) {
                encodeDir(source, dest);
            } else if (hasHtmlSuffix(source)) {
                encodeFile(source, dest);
            } else if (!source.equals(dest)) {
                FileUtils.copyFile(source, dest);
            }
        }
    }

    /** Returns true if the file name ends with html, htm or xhtml (case-insensitive). */
    private static boolean hasHtmlSuffix(final File file) {
        String suffix = FilenameUtils.getExtension(file.getName());
        return "html".equalsIgnoreCase(suffix)
                || "htm".equalsIgnoreCase(suffix)
                || "xhtml".equalsIgnoreCase(suffix);
    }

    /**
     * You can encode a single file (if the first argument is a file) or a
     * whole directory tree (if the first argument is a directory).
     * Optionally you can specify a destination file or directory.
     * If no second argument is given, the original file will be overwritten
     * with the encoded variant.
     *
     * @param args the src dir or file and (optionally) the destination
     */
    public static void main(final String[] args) {
        try {
            switch (args.length) {
                case 1:
                    encode(new File(args[0]));
                    break;
                case 2:
                    encode(new File(args[0]), new File(args[1]));
                    break;
                default:
                    System.err.println("usage: " + UmlautEncoder.class.getName()
                            + " src-file|dir [dest-file|dir]");
                    System.exit(1);
            }
        } catch (IOException ioe) {
            LOG.error("main(" + Converter.toString(args) + ") failed", ioe);
            System.err.println("command failed: " + ioe.getLocalizedMessage());
        }
    }

}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy