All downloads are FREE. The search and download functionality uses the official Maven repository.

org.bbottema.rtftohtml.impl.RTF2HTMLConverterRFCCompliant Maven / Gradle / Ivy

There is a newer version: 1.1.1
Show newest version
/*
 * Copyright (C) ${project.inceptionYear} Benny Bottema ([email protected])
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.bbottema.rtftohtml.impl;

import org.bbottema.rtftohtml.RTF2HTMLConverter;
import org.bbottema.rtftohtml.impl.util.CharsetHelper;
import org.jetbrains.annotations.NotNull;

import java.nio.charset.Charset;
import java.util.LinkedList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static java.util.Objects.requireNonNull;
import static org.bbottema.rtftohtml.impl.util.ByteUtil.hexToString;
import static org.bbottema.rtftohtml.impl.util.CharsetHelper.WINDOWS_CHARSET;

/**
 * The last and most comprehensive converter that follows the RTF RFC and produces the most correct outcome.
 * 

* Note: unlike {@link RTF2HTMLConverterClassic}, this converter doesn't wrap the result in * basic HTML tags if they're not already present in the RTF source. *

* The resulting source and rendered result is on par with software such as Outlook. */ public class RTF2HTMLConverterRFCCompliant implements RTF2HTMLConverter { public static final RTF2HTMLConverter INSTANCE = new RTF2HTMLConverterRFCCompliant(); private static Pattern CONTROL_WORD = Pattern.compile("\\\\(([^a-zA-Z])|(([a-zA-Z]+)(-?[\\d]*) ?))"); private static Pattern ENCODED_CHARACTER = Pattern.compile("\\\\'([0-9a-fA-F]{2})"); private RTF2HTMLConverterRFCCompliant() {} @NotNull public String rtf2html(@NotNull String rtf) { Charset charset = WINDOWS_CHARSET; // RTF processing requires stack holding current settings, each group adds new settings to stack LinkedList groupStack = new LinkedList<>(); groupStack.add(new Group()); Matcher controlWordMatcher = CONTROL_WORD.matcher(rtf); Matcher encodedCharMatcher = ENCODED_CHARACTER.matcher(rtf); StringBuilder result = new StringBuilder(); int length = rtf.length(); int charIndex = 0; while (charIndex < length) { char c = rtf.charAt(charIndex); Group currentGroup = groupStack.getFirst(); if (c == '\r' || c == '\n') { charIndex++; } else if (c == '{') { //entering group groupStack.addFirst(currentGroup.copy()); charIndex++; } else if (c == '}') { //exiting group groupStack.removeFirst(); //Not outputting anything after last closing brace matching opening brace. 
if (groupStack.size() == 1) { break; } charIndex++; } else if (c == '\\') { // matching ansi-encoded sequences like \'f5\'93 encodedCharMatcher.region(charIndex, length); if (encodedCharMatcher.lookingAt()) { StringBuilder encodedSequence = new StringBuilder(); while (encodedCharMatcher.lookingAt()) { encodedSequence.append(encodedCharMatcher.group(1)); charIndex += 4; encodedCharMatcher.region(charIndex, length); } String decoded = hexToString(encodedSequence.toString(), charset); append(result, decoded, currentGroup); continue; } // set matcher to current char position and match from it controlWordMatcher.region(charIndex, length); if (!controlWordMatcher.lookingAt()) { throw new IllegalStateException("RTF file has invalid structure. Failed to match character '" + c + "' at [" + charIndex + "/" + length + "] to a control symbol or word."); } //checking for control symbol or control word //control word can have optional number following it and the option space as well Integer controlNumber = null; String controlWord = controlWordMatcher.group(2); // group(2) matches control symbol if (controlWord == null) { controlWord = controlWordMatcher.group(4); // group(2) matches control word String controlNumberString = controlWordMatcher.group(5); if (!"".equals(controlNumberString)) { controlNumber = Integer.valueOf(controlNumberString); } } charIndex += controlWordMatcher.end() - controlWordMatcher.start(); switch (controlWord) { case "par": append(result, "\n", currentGroup); break; case "tab": append(result, "\t", currentGroup); break; case "htmlrtf": //htmlrtf starts ignored text area, htmlrtf0 ends it //Though technically this is not a group, it's easier to treat it as such to ignore everything in between currentGroup.htmlRtf = controlNumber == null; break; case "ansicpg": //charset definition is important for decoding ansi encoded values charset = CharsetHelper.findCharset(requireNonNull(controlNumber).toString()); break; case "fonttbl": // skipping these groups 
contents - these are font and color settings case "colortbl": currentGroup.ignore = true; break; case "uc": // This denotes a number of characters to skip after unicode symbols currentGroup.unicodeCharLength = controlNumber == null ? 1 : controlNumber; break; case "u": // Unicode symbols if (controlNumber != null) { char unicodeSymbol = (char) controlNumber.intValue(); append(result, Character.toString(unicodeSymbol), currentGroup); charIndex += currentGroup.unicodeCharLength; } break; case "{": // Escaped characters case "}": case "\\": append(result, controlWord, currentGroup); break; default: } } else { append(result, c + "", currentGroup); charIndex++; } } return result.toString(); } private void append(StringBuilder result, String symbol, Group group) { if (!group.ignore && !group.htmlRtf) { result.append(symbol); } } private static class Group { boolean ignore = false; int unicodeCharLength = 1; boolean htmlRtf = false; Group copy() { Group newGroup = new Group(); newGroup.ignore = this.ignore; newGroup.unicodeCharLength = this.unicodeCharLength; newGroup.htmlRtf = this.htmlRtf; return newGroup; } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy