// org.bbottema.rtftohtml.impl.util.CharsetHelper (Maven / Gradle / Ivy)
/*
* Copyright © 2019 John Doe ([email protected])
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.bbottema.rtftohtml.impl.util;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static java.util.Optional.ofNullable;
import static org.bbottema.rtftohtml.impl.util.CodePage.*;
public class CharsetHelper {
private static final Logger LOGGER = LoggerFactory.getLogger(CharsetHelper.class);
private static final String[] CHARSET_PREFIXES = {"", "cp", "iso-", "ibm", "x-windows-", "ms"};
public static Charset findCharsetForCodePage(String rtfCodePage) {
return rtfCodePage.equals("65001") || rtfCodePage.equalsIgnoreCase("cp65001")
? StandardCharsets.UTF_8
: detectCharset(rtfCodePage);
}
private static Charset detectCharset(String rtfCodePage) {
for (String prefix : CHARSET_PREFIXES) {
try {
return Charset.forName(prefix + rtfCodePage);
} catch (UnsupportedCharsetException ignore) {
// ignore
}
}
throw new UnsupportedCharsetException(rtfCodePage);
}
public static Charset detectCharsetFromRtfContent(String rtfContent) {
return ofNullable(detectCharsetByAnsicpg(rtfContent))
.orElse(WINDOWS_1252.getCharset());
}
@Nullable
public static Charset detectCharsetByAnsicpg(String rtfContent) {
Matcher matcher = Pattern.compile("\\\\ansicpg(\\d+)").matcher(rtfContent);
if (matcher.find()) {
int codePage = Integer.parseInt(matcher.group(1));
try {
return CodePage.getCharsetByCodePage(codePage);
} catch (Exception e) {
LOGGER.warn("Failed to detect charset from ansicpg: {}", codePage, e);
return null;
}
}
return null;
}
} © 2015 - 2025 Weber Informatics LLC | Privacy Policy