org.bbottema.rtftohtml.impl.RTF2HTMLConverterClassic Maven / Gradle / Ivy

Go to download
/*
 * Copyright © 2019 John Doe ([email protected])
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.bbottema.rtftohtml.impl;

import org.bbottema.rtftohtml.RTF2HTMLConverter;
import org.bbottema.rtftohtml.impl.util.CharsetHelper;
import org.bbottema.rtftohtml.impl.util.CodePage;
import org.jetbrains.annotations.NotNull;

import java.nio.charset.Charset;
import java.util.regex.Matcher;

import static java.util.regex.Pattern.compile;
import static org.bbottema.rtftohtml.impl.util.ByteUtil.hexToString;

/**
 * Simplistic best guess regex-based approach that always embeds the result in a {@code } tag (if not present) with basic reading setting.
 * 
 * The results of this converter are pretty good, but ends up with a lot of invisible new lines in the resulting HTML. As this code is
 * a result of long-time organically home-grown regular expressions, the code is not as obvious as it would have been if it followed the
 * RTF RFC.
 * 
 * Also, the result-source deviates significantly from other software such as Outlook. On the outside, very complex RTF's might have slight
 * layout issues or stretched images. In general the result is pretty good though.
 */
public class RTF2HTMLConverterClassic implements RTF2HTMLConverter {
    
    public static final RTF2HTMLConverter INSTANCE = new RTF2HTMLConverterClassic();
    
    private static final String[] HTML_START_TAGS = { "", "", "" };
    
    private RTF2HTMLConverterClassic() {}
    
    @NotNull
    public String rtf2html(@NotNull final String rtf) {
        final Charset charset = extractCodepage(rtf);
        String plain = fetchHtmlSection(rtf);
        plain = replaceSpecialSequences(plain); // first step, remove known control words or else we'll match single escape hex values in the next step
        plain = replaceHexSequences(plain, "(?:\\\\f\\d(?:\\\\'..)+)", CodePage.WINDOWS_1252.getCharset()); // match all header control values with default charset
        plain = replaceHexSequences(plain, "(?:\\\\'..)+", charset); // match all remaining escaped hex values as encoded text (which might be DBCS like CP936)
        plain = cleanupRemainingSequences(plain);
        plain = replaceLineBreaks(plain);
        return plain;
    }
    
    private String cleanupRemainingSequences(String plain) {
        return plain
                .replaceAll("(\\\\f\\d.+?;)+", "") // clear all \f sequences including fontnames like Courier new
                .replaceAll("\\\\\\S+", "") // filtering all remaining \ like e.g.: \htmlrtf
                .replaceAll("BM__MailAutoSig((?s).*?(?-s))BM__MailAutoSig", "$1");
    }
    
    private Charset extractCodepage(String rtf) {
        Matcher codePageMatcher = compile("(?:\\\\ansicpg(?.+?)\\\\)+").matcher(rtf);
        if (codePageMatcher.find()) {
            return CharsetHelper.findCharsetForCodePage(codePageMatcher.group("codePage"));
        } else {
            return CodePage.WINDOWS_1252.getCharset(); // fallback
        }
    }
    
    /**
     * @return The text with removed newlines as they are only part of the RTF document and should not be inside the HTML.
     */
    private String replaceLineBreaks(final String text) {
        return text
                .replaceAll("( 
 ( 
 )+)", " 
 ")
                .replaceAll("\\r\\n", "\n")
                .replaceAll("[\\r\\u0000]", "");
    }
    
    /**
     * @return The text with replaced special characters that denote hex codes for strings using Windows CP1252 encoding.
     */
    private String replaceHexSequences(final String text, String sequencesToMatch, final Charset charset) {
        final StringBuilder res = new StringBuilder();
        int lastPosition = 0;
        
        final Matcher escapedHexGroupMatcher = compile(sequencesToMatch).matcher(text);
        while (escapedHexGroupMatcher.find()) {
            res.append(text, lastPosition, escapedHexGroupMatcher.start());
            
            StringBuilder hexText = new StringBuilder();
            
            String escapedHexGroup = escapedHexGroupMatcher.group(0);
            final Matcher unescapedHexCharacterMatcher = compile("\\\\'(..)").matcher(escapedHexGroup);
            while (unescapedHexCharacterMatcher.find()) {
                hexText.append(unescapedHexCharacterMatcher.group(1));
            }
            
            res.append(hexToString(hexText.toString(), charset));
            
            lastPosition = escapedHexGroupMatcher.end();
        }
        
        if (res.length() == 0) {
            res.append(text);
        } else {
            res.append(text, lastPosition, text.length());
        }
        
        return res.toString();
    }
    
    /**
     * @return The actual HTML block / section only but still with RTF code inside (still needs to be cleaned).
     */
    private String fetchHtmlSection(final String text) {
        int htmlStart = -1;
        int htmlEnd = -1;
        
        //determine html tags
        for (int i = 0; i < HTML_START_TAGS.length && htmlStart < 0; i++) {
            htmlStart = text.indexOf(HTML_START_TAGS[i]);
        }
        for (int i = 0; i < HTML_END_TAGS.length && htmlEnd < 0; i++) {
            htmlEnd = text.indexOf(HTML_END_TAGS[i]);
            if (htmlEnd > 0) {
                htmlEnd = htmlEnd + HTML_END_TAGS[i].length();
            }
        }
        
        if (htmlStart > -1 && htmlEnd > -1) {
            //trim rtf code
            return text.substring(htmlStart, htmlEnd + 1);
        } else {
            //embed code within html tags
            String html = "" + text + "";
            //replace linebreaks with html breaks
            html = html.replaceAll("[\\n\\r]+", " ");
            //create hyperlinks
            html = html.replaceAll("(http://\\S+)", "$1");
            return html.replaceAll("mailto:(\\S+@\\S+)", "$1");
        }
    }
    
    /**
     * @return The text with special sequences replaced by equivalent representations.
     */
    @SuppressWarnings("RegExpRedundantEscape")
    private String replaceSpecialSequences(final String text) {
        String replacedText = text;
        //filtering whatever color control sequence, e.g. {\sp{\sn fillColor}{\sv 14935011}}{\sp{\sn fFilled}{\sv 1}}
        replacedText = replacedText.replaceAll("\\{\\\\S+ [^\\s\\\\}]*\\}", "");
        //filtering hyperlink sequences like {HYPERLINK "http://xyz.com/print.jpg"}
        replacedText = replacedText.replaceAll("\\{HYPERLINK[^\\}]*\\}", "");
        //filtering plain replacedText sequences like {\pntext *\tab}
        replacedText = replacedText.replaceAll("\\{\\\\pntext[^\\}]*\\}", "");
        //filtering embedded tags like {\*\htmltag84 +}
        replacedText = replacedText.replaceAll("\\{\\\\\\*\\\\htmltag\\d+ (&[#\\w]+;)}\\\\htmlrtf.*\\\\htmlrtf0 ", "$1");
        //filtering curly braces that are NOT escaped with backslash }, thus marking the end of an RTF sequence
        replacedText = replacedText.replaceAll("([^\\\\])" + "\\}+", "$1");
        replacedText = replacedText.replaceAll("([^\\\\])" + "\\{+", "$1");
        //filtering curly braces that are escaped with backslash \}, thus representing an actual brace
        replacedText = replacedText.replaceAll("\\\\\\}", "}");
        replacedText = replacedText.replaceAll("\\\\\\{", "{");
        //filtering \par sequences
        replacedText = replacedText.replaceAll("\\\\pard*", "\n");
        //filtering \tab sequences
        replacedText = replacedText.replaceAll("\\\\tab", "\t");
        //filtering \*\ like e.g.: \*\fldinst
        replacedText = replacedText.replaceAll("\\\\\\*\\\\\\S+", "");
        return replacedText;
    }
}