All Downloads are FREE. Search and download functionalities are using the official Maven repository.

de.cuioss.tools.string.TextSplitter Maven / Gradle / Ivy

Go to download

Utility library acting as a replacement for Google's Guava, certain Apache Commons libraries, and logging facades/frameworks.

There is a newer version: 2.1.1
Show newest version
/*
 * Copyright 2023 the original author or authors.
 * 

 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * https://www.apache.org/licenses/LICENSE-2.0
 *

* Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.cuioss.tools.string;

import static de.cuioss.tools.string.MoreStrings.isEmpty;
import static de.cuioss.tools.string.MoreStrings.nullToEmpty;
import static java.lang.Integer.valueOf;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;

import de.cuioss.tools.collect.MapBuilder;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.Setter;
import lombok.ToString;

/**
 * Transforms long text into representations that are convenient for HTML
 * output and keeps the computed results, acting as a value object.
 * <p>
 * Two derived views are offered:
 * <ul>
 * <li>{@link #getAbridgedText()}: the text shortened to the configured abridged
 * length, marked with a trailing ellipsis when something was cut off.</li>
 * <li>{@link #getTextWithEnforcedLineBreaks()}: the text enriched with
 * zero-width spaces so that long character runs can be wrapped.</li>
 * </ul>
 * Both views are computed on first access and cached afterwards. Changing the
 * limits through the setters after the corresponding getter has been called
 * therefore does not affect the cached value.
 *
 * @author Eugen Fischer
 */
@EqualsAndHashCode(of = { "source", "abridgedLength", "forceLengthBreak" })
@ToString(of = { "source", "abridgedLength", "forceLengthBreak" })
public class TextSplitter implements Serializable {

    /** serial Version UID */
    private static final long serialVersionUID = 6594890288982910944L;

    /**
     * The browser needs predetermined breaking points in order to be able to
     * split / wrap long chains of characters. Two invisible characters can be
     * embedded into the HTML code for that purpose: {@code &shy;} and
     * {@code &#8203;}. The difference between them is that one is simply a space
     * without width that leaves no trace when the line is wrapped, while the other
     * one adds a hyphen at the wrap position and is therefore suited for
     * hyphenation. This class uses the zero-width space.
     */
    private static final String ZERO_WIDTH_SPACE = "\u200B";

    /** Ellipsis appended to text that had to be shortened. */
    private static final String TRADE_STR = "...";

    /** Chunk length used by the brute-force split if none was configured. */
    private static final int DEFAULT_FORCE_LENGTH_BREAK = 15;

    /** Abridged length used if none was configured. */
    private static final int DEFAULT_ABRIDGED_LENGTH = 20;

    /**
     * Punctuation characters that act as native break opportunities: each match is
     * replaced by the same character followed by a zero-width space.
     */
    private static final Map<Pattern, String> REPLACEMENT_MAP = new MapBuilder<Pattern, String>()
            .put(Pattern.compile("#"), "#" + ZERO_WIDTH_SPACE).put(Pattern.compile("\\+"), "+" + ZERO_WIDTH_SPACE)
            .put(Pattern.compile("-"), "-" + ZERO_WIDTH_SPACE).put(Pattern.compile("_"), "_" + ZERO_WIDTH_SPACE)
            .put(Pattern.compile("\\."), "." + ZERO_WIDTH_SPACE).put(Pattern.compile("\\?"), "?" + ZERO_WIDTH_SPACE)
            .put(Pattern.compile("!"), "!" + ZERO_WIDTH_SPACE).put(Pattern.compile(":"), ":" + ZERO_WIDTH_SPACE)
            .put(Pattern.compile(","), "," + ZERO_WIDTH_SPACE).put(Pattern.compile(";"), ";" + ZERO_WIDTH_SPACE)
            .toImmutableMap();

    /** The text to be processed, never {@code null}. */
    private final String source;

    @Getter(lazy = true)
    private final String abridgedText = initAbridged();

    /** Written while the abridged text is computed, read through {@link #isAbridged()}. */
    private boolean abridged = false;

    @Getter(lazy = true)
    private final String textWithEnforcedLineBreaks = initTextWithLineBreaks();

    /** Maximum chunk length for the brute-force split, {@code null} selects the default. */
    @Setter
    private Integer forceLengthBreak = null;

    /** Target length of the abridged text, {@code null} selects the default. */
    @Setter
    private Integer abridgedLength = null;

    /**
     * Construct TextSplitter using the default lengths.
     *
     * @param longString source text which will be processed, {@code null} is
     *                   treated as empty text
     */
    public TextSplitter(final String longString) {
        source = nullToEmpty(longString);
    }

    /**
     * Alternative constructor with explicit lengths.
     *
     * @param source                target text, {@code null} is treated as empty
     *                              text
     * @param forceLengthBreakCount count of characters after which a break
     *                              opportunity will be forced
     * @param abridgedLengthCount   count of characters the abridged text is
     *                              shortened to
     */
    public TextSplitter(final String source, final int forceLengthBreakCount, final int abridgedLengthCount) {
        // Normalized exactly like in the single-argument constructor
        this.source = nullToEmpty(source);
        forceLengthBreak = valueOf(forceLengthBreakCount);
        abridgedLength = valueOf(abridgedLengthCount);
    }

    /**
     * Tells whether {@link #getAbridgedText()} is a shortened version of the
     * source text.
     *
     * @return {@code true} if the source text had to be cut, {@code false}
     *         otherwise
     */
    public boolean isAbridged() {
        // The flag is a by-product of computing the abridged text, so make sure
        // that computation has happened before the flag is read.
        getAbridgedText();
        return abridged;
    }

    private int getForceLengthBreak() {
        if (null == forceLengthBreak) {
            return DEFAULT_FORCE_LENGTH_BREAK;
        }
        return forceLengthBreak;
    }

    private int getAbridgedLength() {
        if (null == abridgedLength) {
            return DEFAULT_ABRIDGED_LENGTH;
        }
        return abridgedLength;
    }

    /**
     * Computes the abridged text. A source without any blank is treated as
     * "computer produced" and cut at a fixed position, anything else is treated as
     * "human produced" and cut on word boundaries.
     *
     * @return the trimmed abridged text, empty for an empty source
     */
    private String initAbridged() {
        abridged = false;
        if (isEmpty(source)) {
            return "";
        }
        final var sourceSplitted = getSourceSplitted();
        final String result;
        if (sourceSplitted.size() == 1) {
            result = abridgeComputerProducedText();
        } else {
            result = abridgeHumanProducedText(sourceSplitted);
        }
        return result.trim();
    }

    /**
     * Cuts the source at a fixed position, leaving room for a blank and the
     * ellipsis.
     *
     * @return abridged text, or the unchanged source if it is short enough
     */
    private String abridgeComputerProducedText() {
        // Clamp to zero: an abridged length smaller than the ellipsis itself must
        // not result in a negative substring index.
        final var maxLength = Math.max(0, getAbridgedLength() - (TRADE_STR.length() + 1));
        if (source.length() > maxLength) {
            abridged = true;
            return source.substring(0, maxLength) + " " + TRADE_STR;
        }
        return source;
    }

    /**
     * Collects whole words as long as they fit and appends the ellipsis as soon as
     * the next word would reach the limit.
     *
     * @param sourceSplitted the source text split on blanks
     * @return abridged text, possibly with a trailing blank
     */
    private String abridgeHumanProducedText(final List<String> sourceSplitted) {
        final var maxLength = getAbridgedLength() - TRADE_STR.length();
        final var builder = new StringBuilder();
        var count = 0;
        for (final String part : sourceSplitted) {
            count += part.length();
            if (count >= maxLength) {
                builder.append(TRADE_STR);
                abridged = true;
                break;
            }
            builder.append(part).append(' ');
            // account for the blank that was just appended
            count++;
        }
        return builder.toString();
    }

    /**
     * Computes the text with enforced break opportunities.
     *
     * @return the trimmed text enriched with zero-width spaces, empty for an empty
     *         source
     */
    private String initTextWithLineBreaks() {
        if (isEmpty(source)) {
            return "";
        }
        final var sourceSplitted = getSourceSplitted();
        final String result;
        if (sourceSplitted.size() == 1) {
            result = forceLineBreakForComputerProducedText(source);
        } else {
            result = forceLineBreakForHumanProducedText(sourceSplitted);
        }
        return result.trim();
    }

    /**
     * Processes every word on its own and joins the results with blanks.
     *
     * @param sourceSplitted the source text split on blanks
     * @return the processed words, each followed by a blank
     */
    private String forceLineBreakForHumanProducedText(final List<String> sourceSplitted) {
        final var builder = new StringBuilder();
        for (final String text : sourceSplitted) {
            builder.append(forceLineBreakForComputerProducedText(text)).append(' ');
        }
        return builder.toString();
    }

    /**
     * Try to separate text target on native text breaks. If this is not enough use
     * brute-force on max allowed length.
     *
     * @param text target which will be analyzed
     * @return the text with a zero-width space after each native break character
     *         and inside every remaining fragment that exceeds the force length
     *         break
     */
    private String forceLineBreakForComputerProducedText(final String text) {
        // try to separate on native text breaks
        var clean = text;
        for (final Entry<Pattern, String> entry : REPLACEMENT_MAP.entrySet()) {
            clean = entry.getKey().matcher(clean).replaceAll(entry.getValue());
        }
        final List<String> lengthTrimed = new ArrayList<>();
        for (final String item : getSplittedByZeroWidthSpace(clean)) {
            lengthTrimed.add(bruteForceSplit(item));
        }
        return Joiner.on(ZERO_WIDTH_SPACE).join(lengthTrimed);
    }

    /**
     * Verify if very long text still exists and execute brute-force dissipation
     *
     * @param text target
     * @return fragmented text if length doesn't fit to force length break,
     *         otherwise the unchanged text
     */
    private String bruteForceSplit(final String text) {
        final var maxLength = getForceLengthBreak();
        // A non-positive limit cannot be used for chunking: zero would never
        // consume any input and a negative value is not a valid index.
        if (isEmpty(text) || maxLength <= 0 || text.length() <= maxLength) {
            return text;
        }
        final var builder = new StringBuilder(text.length() + text.length() / maxLength);
        var offset = 0;
        while (text.length() - offset > maxLength) {
            builder.append(text, offset, offset + maxLength).append(ZERO_WIDTH_SPACE);
            offset += maxLength;
        }
        // The loop always leaves a non-empty remainder of at most maxLength characters
        builder.append(text, offset, text.length());
        return builder.toString();
    }

    private static List<String> getSplittedByZeroWidthSpace(final String value) {
        return Splitter.on(ZERO_WIDTH_SPACE).splitToList(value);
    }

    private List<String> getSourceSplitted() {
        return Splitter.on(" ").splitToList(source);
    }
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy