org.bbottema.rtftohtml.impl.RTF2HTMLConverterRFCCompliant Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of rtf-to-html Show documentation
RTF to HTML conversion done right
The newest version!
/*
 * Copyright © 2019 John Doe ([email protected])
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.bbottema.rtftohtml.impl;

import org.bbottema.rtftohtml.RTF2HTMLConverter;
import org.bbottema.rtftohtml.impl.util.CharsetHelper;
import org.bbottema.rtftohtml.impl.util.CodePage;
import org.jetbrains.annotations.NotNull;

import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Map;
import java.util.regex.Matcher;

import static java.util.Objects.requireNonNull;
import static org.bbottema.rtftohtml.impl.util.ByteUtil.hexToString;

/**
 * Converter that walks the RTF token stream (groups, control words/symbols, hex-escaped bytes and
 * literal text) and collects the text that is not part of an ignored area.
 * <p>
 * Note: unlike {@link RTF2HTMLConverterClassic}, this converter doesn't wrap the result in
 * basic HTML tags if they're not already present in the RTF source.
 * <p>
 * The resulting source and rendered result are intended to be on par with software such as Outlook.
 */
public class RTF2HTMLConverterRFCCompliant implements RTF2HTMLConverter {

    public static final RTF2HTMLConverter INSTANCE = new RTF2HTMLConverterRFCCompliant();

    private RTF2HTMLConverterRFCCompliant() {}

    /**
     * Converts the given RTF source to its embedded HTML / text content.
     * <p>
     * Processing stops at the closing brace that matches the first opening brace; anything after it
     * is not emitted.
     *
     * @param rtf the complete RTF document source
     * @return the extracted content, never {@code null}
     * @throws IllegalStateException if a backslash cannot be matched to a control symbol or word
     * @throws NullPointerException  if an {@code \ansicpg} control word carries no code page number
     */
    @NotNull
    public String rtf2html(@NotNull String rtf) {
        // Charsets registered per font index through \fN ... \fcharsetN pairs
        Map<Integer, FontTableEntry> fontTable = new HashMap<>();
        Charset charset = CharsetHelper.detectCharsetFromRtfContent(rtf);

        // RTF processing requires a stack holding the current settings; each group pushes a copy of its parent's
        LinkedList<Group> groupStack = new LinkedList<>();
        groupStack.add(new Group());

        // CONTROL_WORD and ENCODED_CHARACTER are patterns declared outside this class body
        Matcher controlWordMatcher = CONTROL_WORD.matcher(rtf);
        Matcher encodedCharMatcher = ENCODED_CHARACTER.matcher(rtf);
        StringBuilder result = new StringBuilder();
        int length = rtf.length();
        int charIndex = 0;

        while (charIndex < length) {
            char c = rtf.charAt(charIndex);
            Group currentGroup = groupStack.getFirst();
            if (c == '\r' || c == '\n') {
                // raw line breaks carry no meaning in RTF
                charIndex++;
            } else if (c == '{') {  // entering group
                groupStack.addFirst(currentGroup.copy());
                charIndex++;
            } else if (c == '}') {  // exiting group
                groupStack.removeFirst();
                // Not outputting anything after the last closing brace matching the opening brace.
                if (groupStack.size() == 1) {
                    break;
                }
                charIndex++;
            } else if (c == '\\') {
                // matching ansi-encoded sequences like \'f5\'93; consecutive escapes are decoded
                // together so that multi-byte characters are not split
                encodedCharMatcher.region(charIndex, length);
                if (encodedCharMatcher.lookingAt()) {
                    StringBuilder encodedSequence = new StringBuilder();
                    do {
                        encodedSequence.append(encodedCharMatcher.group(1));
                        // advance by the actual match length instead of a hard-coded width
                        charIndex = encodedCharMatcher.end();
                        encodedCharMatcher.region(charIndex, length);
                    } while (encodedCharMatcher.lookingAt());

                    Charset effectiveCharset = resolveCharset(charset, currentGroup, fontTable);
                    String decoded = hexToString(encodedSequence.toString(), effectiveCharset);
                    appendIfNotIgnoredGroup(result, decoded, currentGroup);
                    continue;
                }

                // set matcher to current char position and match from it
                controlWordMatcher.region(charIndex, length);
                if (!controlWordMatcher.lookingAt()) {
                    throw new IllegalStateException("RTF file has invalid structure. Failed to match character '" +
                            c + "' at [" + charIndex + "/" + length + "] to a control symbol or word.");
                }

                // checking for control symbol or control word
                // a control word can have an optional number following it and an optional space as well
                Integer controlNumber = null;
                String controlWord = controlWordMatcher.group(2); // group(2) matches control symbol
                if (controlWord == null) {
                    controlWord = controlWordMatcher.group(4); // group(4) matches control word
                    String controlNumberString = controlWordMatcher.group(5);
                    // guard against both a non-participating group (null) and an empty capture
                    if (controlNumberString != null && !controlNumberString.isEmpty()) {
                        controlNumber = Integer.valueOf(controlNumberString);
                    }
                }
                charIndex = controlWordMatcher.end();

                switch (controlWord) {
                    case "par":
                        appendIfNotIgnoredGroup(result, "\n", currentGroup);
                        break;
                    case "tab":
                        appendIfNotIgnoredGroup(result, "\t", currentGroup);
                        break;
                    case "htmlrtf":
                        // htmlrtf starts an ignored text area, htmlrtf0 ends it.
                        // Though technically this is not a group, it's easier to treat it as such
                        // to ignore everything in between.
                        currentGroup.htmlRtf = controlNumber == null;
                        break;
                    case "ansicpg":
                        // charset definition is important for decoding ansi encoded values
                        charset = CharsetHelper.findCharsetForCodePage(requireNonNull(controlNumber).toString());
                        break;
                    case "fonttbl": // skipping these groups' contents - these are font and color settings
                    case "colortbl":
                        currentGroup.ignore = true;
                        break;
                    case "f":
                        // font table index. Might be a new one, or an existing one
                        currentGroup.fontTableIndex = controlNumber;
                        break;
                    case "fcharset":
                        registerFontCharset(fontTable, currentGroup, controlNumber);
                        break;
                    case "uc":
                        // This denotes the number of fallback characters to skip after unicode symbols
                        currentGroup.unicodeCharLength = controlNumber == null ? 1 : controlNumber;
                        break;
                    case "u":
                        // Unicode symbols; the int-to-char cast keeps the low 16 bits, which maps the
                        // negative (signed 16-bit) values RTF writers emit back to the intended code unit
                        if (controlNumber != null) {
                            char unicodeSymbol = (char) controlNumber.intValue();
                            appendIfNotIgnoredGroup(result, Character.toString(unicodeSymbol), currentGroup);
                            charIndex = skipUnicodeFallback(rtf, charIndex, currentGroup.unicodeCharLength,
                                    encodedCharMatcher);
                        }
                        break;
                    case "{":  // Escaped characters
                    case "}":
                    case "\\":
                        appendIfNotIgnoredGroup(result, controlWord, currentGroup);
                        break;
                    case "pntext":
                        currentGroup.ignore = true;
                        break;
                    default:
                        // every other control word / symbol is consumed without producing output
                }

            } else {
                appendIfNotIgnoredGroup(result, String.valueOf(c), currentGroup);
                charIndex++;
            }
        }
        return result.toString();
    }

    /**
     * Picks the charset for decoding hex-escaped bytes: the charset registered for the group's
     * current font if there is one, otherwise the document-level charset.
     */
    private static Charset resolveCharset(Charset documentCharset, Group group,
                                          Map<Integer, FontTableEntry> fontTable) {
        if (group.fontTableIndex != null) {
            FontTableEntry entry = fontTable.get(group.fontTableIndex);
            if (entry != null && entry.charset != null) {
                return entry.charset;
            }
        }
        return documentCharset;
    }

    /**
     * Stores the charset belonging to an {@code \fcharsetN} value under the group's current font index.
     * Does nothing when the number or font index is missing, or when no charset is known for the value.
     */
    private static void registerFontCharset(Map<Integer, FontTableEntry> fontTable, Group group,
                                            Integer controlNumber) {
        if (controlNumber == null || group.fontTableIndex == null) {
            return;
        }
        Charset possibleCharset = CodePage.getCharsetByCodePage(controlNumber);
        if (possibleCharset == null) {
            return;
        }
        FontTableEntry entry = fontTable.get(group.fontTableIndex);
        if (entry == null) {
            entry = new FontTableEntry();
            fontTable.put(group.fontTableIndex, entry);
        }
        entry.charset = possibleCharset;
    }

    /**
     * Skips the fallback representation that follows a {@code \\uN} control word.
     * <p>
     * A hex escape (as matched by {@code encodedCharMatcher}) counts as a single fallback character,
     * a group delimiter ends the fallback early so group nesting stays intact, and a non-positive
     * count skips nothing. Anything else is skipped one char at a time.
     *
     * @param rtf                the RTF source
     * @param index              position directly after the {@code \\uN} control word
     * @param count              number of fallback characters to skip (the active {@code \\ucN} value)
     * @param encodedCharMatcher matcher over {@code rtf} recognising hex escapes; its region is modified
     * @return the position of the first character after the fallback, never beyond {@code rtf.length()}
     */
    private static int skipUnicodeFallback(String rtf, int index, int count, Matcher encodedCharMatcher) {
        int length = rtf.length();
        for (int skipped = 0; skipped < count && index < length; skipped++) {
            char next = rtf.charAt(index);
            if (next == '{' || next == '}') {
                break;
            }
            if (next == '\\') {
                encodedCharMatcher.region(index, length);
                if (encodedCharMatcher.lookingAt()) {
                    index = encodedCharMatcher.end();
                    continue;
                }
            }
            index++;
        }
        return index;
    }

    /**
     * Appends {@code symbol} to {@code result} unless the group is ignored or inside an htmlrtf area.
     */
    private static void appendIfNotIgnoredGroup(StringBuilder result, String symbol, Group group) {
        if (!group.ignore && !group.htmlRtf) {
            result.append(symbol);
        }
    }

    /**
     * Settings that apply to the RTF group currently being processed.
     */
    private static class Group {
        /** When set, text in this group (and groups copied from it) is not emitted. */
        boolean ignore = false;
        /** Number of fallback characters following a unicode symbol, as set by {@code \\ucN}. */
        int unicodeCharLength = 1;
        /** Set while inside an {@code \\htmlrtf} area, whose text is not emitted. */
        boolean htmlRtf = false;
        /** Font index last set by {@code \\fN} in this group, or {@code null} if none. */
        Integer fontTableIndex = null;

        /**
         * Creates the settings for a nested group, inheriting everything except the font index.
         */
        Group copy() {
            Group newGroup = new Group();
            newGroup.ignore = this.ignore;
            newGroup.unicodeCharLength = this.unicodeCharLength;
            newGroup.htmlRtf = this.htmlRtf;
            // Don't inherit fontTableIndex from parent group.
            return newGroup;
        }
    }

    /**
     * Font table entry; only the charset is tracked.
     */
    private static class FontTableEntry {
        Charset charset = null;
    }
}