// patterntesting.tool.html.UmlautEncoder Maven / Gradle / Ivy

// Go to download
/*
 * $Id: UmlautEncoder.java,v 1.10 2016/12/30 19:07:44 oboehm Exp $
 *
 * Copyright (c) 2010 by Oliver Boehm
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * (c)reated 06.01.2011 by oliver ([email protected])
 */

package patterntesting.tool.html;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import patterntesting.runtime.util.Converter;

import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Converts German umlauts (and the sharp s) into their HTML entity
 * representation.
 * <p>
 * Even if the encoding is declared correctly inside an HTML file it happens
 * sometimes that umlauts are not displayed correctly. One reason for this
 * may be the web server itself, which delivers the page with a different
 * encoding. To avoid such problems you can convert the umlauts into their
 * HTML representation (e.g. {@code &auml;}) using this class.
 *
 * @author oliver
 * @since 1.1 (06.01.2011)
 */
public final class UmlautEncoder {

    private static final Logger LOG = LogManager.getLogger(UmlautEncoder.class);

    /** Matches an HTML comment; DOTALL because comments may span several lines. */
    private static final Pattern HTML_COMMENT = Pattern.compile("<!--.*?-->", Pattern.DOTALL);

    /**
     * Matches a charset declaration inside a meta tag. Covers both
     * {@code <meta charset="...">} and the older
     * {@code <meta http-equiv="Content-Type" content="text/html; charset=...">}
     * form. Group 1 is the charset name without quotes.
     */
    private static final Pattern META_CHARSET = Pattern.compile(
            "<meta[^>]+?\\bcharset\\s*=\\s*['\"]?([^'\"\\s;/>]+)",
            Pattern.CASE_INSENSITIVE);

    /** Utility class - no need to instantiate it. */
    private UmlautEncoder() {}

    /**
     * Replaces each umlaut (and the sharp s) of the given string by its HTML
     * entity. All other characters are copied unchanged.
     *
     * @param input the text to encode, must not be null
     * @return the text with umlauts replaced by HTML entities
     */
    public static String encode(final String input) {
        // Entities are longer than the single character they replace,
        // so leave some headroom to avoid an early buffer resize.
        StringBuilder encoded = new StringBuilder(input.length() + 16);
        for (char c : input.toCharArray()) {
            encoded.append(encode(c));
        }
        return encoded.toString();
    }

    /**
     * Encodes a single character.
     *
     * @param c a single character
     * @return the HTML entity for an umlaut or the sharp s, otherwise the
     *         character itself as string
     */
    private static String encode(final char c) {
        switch (c) {
        case '\u00e4':
            return "&auml;";
        case '\u00f6':
            return "&ouml;";
        case '\u00fc':
            return "&uuml;";
        case '\u00df':
            return "&szlig;";
        case '\u00c4':
            return "&Auml;";
        case '\u00d6':
            return "&Ouml;";
        case '\u00dc':
            return "&Uuml;";
        default:
            return Character.toString(c);
        }
    }

    /**
     * Encodes a file or directory in place, i.e. the original content is
     * overwritten with the encoded variant.
     *
     * @param file the file or directory to encode
     * @throws IOException Signals that an I/O exception has occurred.
     */
    public static void encode(final File file) throws IOException {
        encode(file, file);
    }

    /**
     * Encodes a single file or a whole directory tree. If {@code from} is a
     * directory, HTML files below it are encoded and all other files are
     * copied; otherwise {@code from} is encoded as a single file.
     *
     * @param from the source file or directory
     * @param to the destination file or directory (may be the same as
     *        {@code from})
     * @throws IOException Signals that an I/O exception has occurred.
     */
    public static void encode(final File from, final File to) throws IOException {
        if (from.isDirectory()) {
            encodeDir(from, to);
        } else {
            encodeFile(from, to);
        }
    }

    /**
     * Reads {@code from} with its guessed encoding, replaces the umlauts and
     * writes the result as UTF-8 to {@code to}. The source is read completely
     * before the destination is written, so both may be the same file.
     */
    private static void encodeFile(final File from, final File to) throws IOException {
        LOG.trace("encoding umlauts in {} to {}...", from, to);
        String encoded = encode(readFile(from));
        FileUtils.writeStringToFile(to, encoded, StandardCharsets.UTF_8);
    }

    /** Reads the whole file and decodes it with the encoding guessed from its content. */
    private static String readFile(final File file) throws IOException {
        Charset encoding = guessEncoding(file);
        byte[] bytes = FileUtils.readFileToByteArray(file);
        CharBuffer decoded = encoding.decode(ByteBuffer.wrap(bytes));
        return decoded.toString();
    }

    /**
     * Guesses the encoding of an HTML file. It looks for a meta tag which
     * declares the charset. An XML declaration at the beginning of the file
     * is not evaluated. HTML comments are removed before the search, so a
     * meta tag which is commented out is ignored.
     * <p>
     * If no charset is declared, or the declared charset name is invalid or
     * not supported by this JVM, the platform default charset is returned.
     * <p>
     * Note: the CharsetToolkit of the guessencoding library was used before
     * but it did not work as expected.
     *
     * @param file must be an HTML file
     * @return The encoding the file uses (or default if not apparent).
     * @throws IOException Signals that an I/O exception has occurred.
     */
    public static Charset guessEncoding(final File file) throws IOException {
        String content = deleteComments(FileUtils.readFileToString(file, StandardCharsets.UTF_8));
        Matcher matcher = META_CHARSET.matcher(content);
        if (matcher.find()) {
            String charsetName = matcher.group(1);
            if (StringUtils.isNotEmpty(charsetName)) {
                try {
                    return Charset.forName(charsetName);
                } catch (IllegalArgumentException ex) {
                    // IllegalCharsetNameException and UnsupportedCharsetException
                    // are both IllegalArgumentExceptions: a bogus declaration
                    // should not abort the conversion.
                    LOG.warn("unsupported charset '" + charsetName + "' declared in " + file
                            + " - using default charset", ex);
                }
            }
        }
        return Charset.defaultCharset();
    }

    /** Removes all HTML comments from the given content. */
    private static String deleteComments(final String content) {
        return HTML_COMMENT.matcher(content).replaceAll("");
    }

    /**
     * Walks recursively through {@code from}: HTML files are encoded into
     * {@code to}, sub directories are processed recursively and all other
     * files are copied (unless source and destination are the same file).
     */
    private static void encodeDir(final File from, final File to) throws IOException {
        // List the source before the destination is created: if 'to' is a
        // new directory inside 'from' it must not show up in the listing,
        // otherwise the recursion would descend into its own output.
        File[] files = from.listFiles();
        if (files == null) {
            throw new IllegalArgumentException("not a directory: " + from);
        }
        if (!to.isDirectory()) {
            // mkdirs() also creates missing parent directories; the second
            // isDirectory() check tolerates a concurrent creation.
            if (!to.mkdirs() && !to.isDirectory()) {
                throw new IOException("can't create dir " + to);
            }
            LOG.debug("created: dir {}", to);
        }
        for (File file : files) {
            File dest = new File(to, file.getName());
            if (file.isDirectory()) {
                encodeDir(file, dest);
            } else if (hasHtmlSuffix(file)) {
                encodeFile(file, dest);
            } else if (!file.equals(dest)) {
                FileUtils.copyFile(file, dest);
            }
        }
    }

    /** Returns true if the file name ends with html, htm or xhtml (case-insensitive). */
    private static boolean hasHtmlSuffix(final File file) {
        String suffix = FilenameUtils.getExtension(file.getName());
        return "html".equalsIgnoreCase(suffix)
                || "htm".equalsIgnoreCase(suffix)
                || "xhtml".equalsIgnoreCase(suffix);
    }

    /**
     * You can encode a single file (if the first argument is a file) or a
     * whole directory tree (if the first argument is a directory).
     * Optionally you can specify a destination file or directory.
     * If no second argument is given the original file will be overwritten
     * with the encoded variant.
     * <p>
     * With any other number of arguments a usage message is printed and the
     * JVM exits with status 1. An I/O failure is logged and reported on
     * stderr.
     *
     * @param args the src dir or file and (optionally) the destination
     */
    public static void main(final String[] args) {
        try {
            switch (args.length) {
            case 1:
                encode(new File(args[0]));
                break;
            case 2:
                encode(new File(args[0]), new File(args[1]));
                break;
            default:
                System.err.println("usage: " + UmlautEncoder.class.getName()
                        + " src-file|dir [dest-file|dir]");
                System.exit(1);
            }
        } catch (IOException ioe) {
            LOG.error("main(" + Converter.toString(args) + ") failed", ioe);
            System.err.println("command failed: " + ioe.getLocalizedMessage());
        }
    }

}