All downloads are free. Search and download functionality uses the official Maven repository.

org.bbottema.rtftohtml.impl.RTF2HTMLConverterClassic Maven / Gradle / Ivy

/*
 * Copyright © 2019 John Doe ([email protected])
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.bbottema.rtftohtml.impl;

import org.bbottema.rtftohtml.RTF2HTMLConverter;
import org.bbottema.rtftohtml.impl.util.CharsetHelper;
import org.bbottema.rtftohtml.impl.util.CodePage;
import org.jetbrains.annotations.NotNull;

import java.nio.charset.Charset;
import java.util.regex.Matcher;

import static java.util.regex.Pattern.compile;
import static org.bbottema.rtftohtml.impl.util.ByteUtil.hexToString;

/**
 * Simplistic best guess regex-based approach that always embeds the result in a {@code } tag (if not present) with basic reading setting.
 * 

* The results of this converter are pretty good, but ends up with a lot of invisible new lines in the resulting HTML. As this code is * a result of long-time organically home-grown regular expressions, the code is not as obvious as it would have been if it followed the * RTF RFC. *

* Also, the result-source deviates significantly from other software such as Outlook. On the outside, very complex RTF's might have slight * layout issues or stretched images. In general the result is pretty good though. */ public class RTF2HTMLConverterClassic implements RTF2HTMLConverter { public static final RTF2HTMLConverter INSTANCE = new RTF2HTMLConverterClassic(); private static final String[] HTML_START_TAGS = { "", "", "" }; private RTF2HTMLConverterClassic() {} @NotNull public String rtf2html(@NotNull final String rtf) { final Charset charset = extractCodepage(rtf); String plain = fetchHtmlSection(rtf); plain = replaceSpecialSequences(plain); // first step, remove known control words or else we'll match single escape hex values in the next step plain = replaceHexSequences(plain, "(?:\\\\f\\d(?:\\\\'..)+)", CodePage.WINDOWS_1252.getCharset()); // match all header control values with default charset plain = replaceHexSequences(plain, "(?:\\\\'..)+", charset); // match all remaining escaped hex values as encoded text (which might be DBCS like CP936) plain = cleanupRemainingSequences(plain); plain = replaceLineBreaks(plain); return plain; } private String cleanupRemainingSequences(String plain) { return plain .replaceAll("(\\\\f\\d.+?;)+", "") // clear all \f sequences including fontnames like Courier new .replaceAll("\\\\\\S+", "") // filtering all remaining \ like e.g.: \htmlrtf .replaceAll("BM__MailAutoSig((?s).*?(?-s))BM__MailAutoSig", "$1"); } private Charset extractCodepage(String rtf) { Matcher codePageMatcher = compile("(?:\\\\ansicpg(?.+?)\\\\)+").matcher(rtf); if (codePageMatcher.find()) { return CharsetHelper.findCharsetForCodePage(codePageMatcher.group("codePage")); } else { return CodePage.WINDOWS_1252.getCharset(); // fallback } } /** * @return The text with removed newlines as they are only part of the RTF document and should not be inside the HTML. */ private String replaceLineBreaks(final String text) { return text .replaceAll("(
(
)+)", "
") .replaceAll("\\r\\n", "\n") .replaceAll("[\\r\\u0000]", ""); } /** * @return The text with replaced special characters that denote hex codes for strings using Windows CP1252 encoding. */ private String replaceHexSequences(final String text, String sequencesToMatch, final Charset charset) { final StringBuilder res = new StringBuilder(); int lastPosition = 0; final Matcher escapedHexGroupMatcher = compile(sequencesToMatch).matcher(text); while (escapedHexGroupMatcher.find()) { res.append(text, lastPosition, escapedHexGroupMatcher.start()); StringBuilder hexText = new StringBuilder(); String escapedHexGroup = escapedHexGroupMatcher.group(0); final Matcher unescapedHexCharacterMatcher = compile("\\\\'(..)").matcher(escapedHexGroup); while (unescapedHexCharacterMatcher.find()) { hexText.append(unescapedHexCharacterMatcher.group(1)); } res.append(hexToString(hexText.toString(), charset)); lastPosition = escapedHexGroupMatcher.end(); } if (res.length() == 0) { res.append(text); } else { res.append(text, lastPosition, text.length()); } return res.toString(); } /** * @return The actual HTML block / section only but still with RTF code inside (still needs to be cleaned). 
*/ private String fetchHtmlSection(final String text) { int htmlStart = -1; int htmlEnd = -1; //determine html tags for (int i = 0; i < HTML_START_TAGS.length && htmlStart < 0; i++) { htmlStart = text.indexOf(HTML_START_TAGS[i]); } for (int i = 0; i < HTML_END_TAGS.length && htmlEnd < 0; i++) { htmlEnd = text.indexOf(HTML_END_TAGS[i]); if (htmlEnd > 0) { htmlEnd = htmlEnd + HTML_END_TAGS[i].length(); } } if (htmlStart > -1 && htmlEnd > -1) { //trim rtf code return text.substring(htmlStart, htmlEnd + 1); } else { //embed code within html tags String html = "" + text + ""; //replace linebreaks with html breaks html = html.replaceAll("[\\n\\r]+", " "); //create hyperlinks html = html.replaceAll("(http://\\S+)", "$1"); return html.replaceAll("mailto:(\\S+@\\S+)", "$1"); } } /** * @return The text with special sequences replaced by equivalent representations. */ @SuppressWarnings("RegExpRedundantEscape") private String replaceSpecialSequences(final String text) { String replacedText = text; //filtering whatever color control sequence, e.g. 
{\sp{\sn fillColor}{\sv 14935011}}{\sp{\sn fFilled}{\sv 1}} replacedText = replacedText.replaceAll("\\{\\\\S+ [^\\s\\\\}]*\\}", ""); //filtering hyperlink sequences like {HYPERLINK "http://xyz.com/print.jpg"} replacedText = replacedText.replaceAll("\\{HYPERLINK[^\\}]*\\}", ""); //filtering plain replacedText sequences like {\pntext *\tab} replacedText = replacedText.replaceAll("\\{\\\\pntext[^\\}]*\\}", ""); //filtering embedded tags like {\*\htmltag84 +} replacedText = replacedText.replaceAll("\\{\\\\\\*\\\\htmltag\\d+ (&[#\\w]+;)}\\\\htmlrtf.*\\\\htmlrtf0 ", "$1"); //filtering curly braces that are NOT escaped with backslash }, thus marking the end of an RTF sequence replacedText = replacedText.replaceAll("([^\\\\])" + "\\}+", "$1"); replacedText = replacedText.replaceAll("([^\\\\])" + "\\{+", "$1"); //filtering curly braces that are escaped with backslash \}, thus representing an actual brace replacedText = replacedText.replaceAll("\\\\\\}", "}"); replacedText = replacedText.replaceAll("\\\\\\{", "{"); //filtering \par sequences replacedText = replacedText.replaceAll("\\\\pard*", "\n"); //filtering \tab sequences replacedText = replacedText.replaceAll("\\\\tab", "\t"); //filtering \*\ like e.g.: \*\fldinst replacedText = replacedText.replaceAll("\\\\\\*\\\\\\S+", ""); return replacedText; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy