de.cuioss.tools.string.TextSplitter Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of cui-java-tools Show documentation
Utility library acting as a replacement for Google's Guava, certain Apache Commons libraries, and logging facades/frameworks.
There is a newer version: 2.1.1
Show newest version
/*
 * Copyright 2023 the original author or authors.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     https://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.cuioss.tools.string;

import static de.cuioss.tools.string.MoreStrings.isEmpty;
import static de.cuioss.tools.string.MoreStrings.nullToEmpty;
import static java.lang.Integer.valueOf;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;

import de.cuioss.tools.collect.MapBuilder;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.Setter;
import lombok.ToString;

/**
 * Transforms long text into representations that are convenient for HTML
 * output and encapsulates the result as an object. It is implemented as a
 * value-object that keeps the calculated texts: both derived texts are computed
 * lazily on first access and cached afterwards.
 * <p>
 * Two representations are offered:
 * <ul>
 * <li>{@link #getAbridgedText()}: the source shortened to roughly
 * {@code abridgedLength} characters, marked with "..." when shortened.</li>
 * <li>{@link #getTextWithEnforcedLineBreaks()}: the source with zero-width
 * spaces inserted so that a browser is able to wrap long character
 * sequences.</li>
 * </ul>
 *
 * @author Eugen Fischer
 */
@EqualsAndHashCode(of = { "source", "abridgedLength", "forceLengthBreak" })
@ToString(of = { "source", "abridgedLength", "forceLengthBreak" })
public class TextSplitter implements Serializable {

    /** serial Version UID */
    private static final long serialVersionUID = 6594890288982910944L;

    /**
     * The browser needs predetermined breaking points in order to be able to
     * break/wrap long chains of words. There are two invisible characters that can
     * be embedded into HTML for this purpose: {@code &shy;} (soft hyphen) and
     * {@code &#8203;} (zero-width space). The difference between the two is that
     * the zero-width space is simply a space without width that leaves no trace
     * when the line is wrapped, whereas the soft hyphen adds a hyphen at the break
     * and is therefore suited for hyphenation. This class uses the zero-width
     * space.
     */
    private static final String ZERO_WIDTH_SPACE = "\u200B";

    /** Marker appended to a text that has been shortened. */
    private static final String TRADE_STR = "...";

    /** Maximum run length without break opportunity if none was configured. */
    private static final int DEFAULT_FORCE_LENGTH_BREAK = 15;

    /** Target length of the abridged text if none was configured. */
    private static final int DEFAULT_ABRIDGED_LENGTH = 20;

    /**
     * Punctuation characters that are natural break opportunities, mapped to the
     * same character followed by a zero-width space.
     */
    private static final Map<Pattern, String> REPLACEMENT_MAP = new MapBuilder<Pattern, String>()
            .put(Pattern.compile("#"), "#" + ZERO_WIDTH_SPACE).put(Pattern.compile("\\+"), "+" + ZERO_WIDTH_SPACE)
            .put(Pattern.compile("-"), "-" + ZERO_WIDTH_SPACE).put(Pattern.compile("_"), "_" + ZERO_WIDTH_SPACE)
            .put(Pattern.compile("\\."), "." + ZERO_WIDTH_SPACE).put(Pattern.compile("\\?"), "?" + ZERO_WIDTH_SPACE)
            .put(Pattern.compile("!"), "!" + ZERO_WIDTH_SPACE).put(Pattern.compile(":"), ":" + ZERO_WIDTH_SPACE)
            .put(Pattern.compile(","), "," + ZERO_WIDTH_SPACE).put(Pattern.compile(";"), ";" + ZERO_WIDTH_SPACE)
            .toImmutableMap();

    /** The text to be processed; never {@code null}. */
    private final String source;

    @Getter(lazy = true)
    private final String abridgedText = initAbridged();

    /**
     * Written while the abridged text is computed: {@code true} only if the source
     * actually had to be shortened.
     */
    private boolean abridged = false;

    @Getter(lazy = true)
    private final String textWithEnforcedLineBreaks = initTextWithLineBreaks();

    /**
     * Maximum number of consecutive characters without a break opportunity;
     * {@code null} selects the default. Changing it after
     * {@link #getTextWithEnforcedLineBreaks()} has been called does not recompute
     * the cached text.
     */
    @Setter
    private Integer forceLengthBreak = null;

    /**
     * Target length of the abridged text; {@code null} selects the default.
     * Changing it after {@link #getAbridgedText()} has been called does not
     * recompute the cached text.
     */
    @Setter
    private Integer abridgedLength = null;

    /**
     * Construct TextSplitter using the default lengths.
     *
     * @param longString source text which will be processed; {@code null} is
     *                   treated as empty text
     */
    public TextSplitter(final String longString) {
        source = nullToEmpty(longString);
    }

    /**
     * Alternative Constructor
     *
     * @param source                target text; {@code null} is treated as empty
     *                              text
     * @param forceLengthBreakCount count of characters after which a text break
     *                              will be forced
     * @param abridgedLengthCount   count of characters of the abridged text
     */
    public TextSplitter(final String source, final int forceLengthBreakCount, final int abridgedLengthCount) {
        // Same null handling as the single-argument constructor.
        this.source = nullToEmpty(source);
        forceLengthBreak = valueOf(forceLengthBreakCount);
        abridgedLength = valueOf(abridgedLengthCount);
    }

    /**
     * Reports whether the abridged text is shorter than the source. The abridged
     * text is computed on demand if that has not happened yet, so the result does
     * not depend on whether {@link #getAbridgedText()} was called before.
     *
     * @return {@code true} if the source was shortened, {@code false} otherwise
     */
    public boolean isAbridged() {
        // Make sure the lazy computation that sets the flag has run.
        getAbridgedText();
        return abridged;
    }

    /**
     * @return the configured force-break length, or the default if none was
     *         configured or the configured value is not positive (a non-positive
     *         chunk size cannot be used for splitting)
     */
    private int getForceLengthBreak() {
        if (null == forceLengthBreak || forceLengthBreak <= 0) {
            return DEFAULT_FORCE_LENGTH_BREAK;
        }
        return forceLengthBreak;
    }

    /**
     * @return the configured abridged length, or the default if none was
     *         configured
     */
    private int getAbridgedLength() {
        if (null == abridgedLength) {
            return DEFAULT_ABRIDGED_LENGTH;
        }
        return abridgedLength;
    }

    /**
     * Computes the abridged text and, as a side effect, the {@code abridged} flag.
     * Text without any space is treated as computer produced and cut at a fixed
     * position; text with spaces is treated as human produced and cut at a word
     * boundary.
     *
     * @return the trimmed abridged text, empty if the source is empty
     */
    private String initAbridged() {
        abridged = false;

        if (isEmpty(source)) {
            return "";
        }

        final var sourceSplitted = getSourceSplitted();
        final String result;
        if (sourceSplitted.size() == 1) {
            result = abridgeComputerProducedText();
        } else {
            result = abridgeHumanProducedText(sourceSplitted);
        }
        return result.trim();
    }

    /**
     * Cuts the source at a fixed position, leaving room for the " ..." marker. Sets
     * the {@code abridged} flag if the source was cut.
     *
     * @return abridged text
     */
    private String abridgeComputerProducedText() {
        // Clamp to 0 so that a very small abridged length cannot produce a negative
        // end index for substring.
        final var maxLength = Math.max(0, getAbridgedLength() - (TRADE_STR.length() + 1));
        if (source.length() > maxLength) {
            abridged = true;
            return source.substring(0, maxLength) + " ...";
        }
        return source;
    }

    /**
     * Appends whole words, separated by a single space, until adding the next word
     * would reach the allowed length; in that case the "..." marker is appended
     * instead of the word and the {@code abridged} flag is set.
     *
     * @param sourceSplitted the source split on spaces
     * @return abridged text, possibly with a trailing space
     */
    private String abridgeHumanProducedText(final List<String> sourceSplitted) {
        final var maxLength = getAbridgedLength() - TRADE_STR.length();
        final var builder = new StringBuilder();
        var count = 0;
        for (final String part : sourceSplitted) {
            count = count + part.length();
            if (count >= maxLength) {
                builder.append(TRADE_STR);
                abridged = true;
                break;
            }

            builder.append(part).append(" ");

            // account for the separating space
            count = count + 1;
        }
        return builder.toString();
    }

    /**
     * Computes the text with enforced break opportunities. Text without any space
     * is processed as a whole, otherwise each space separated word is processed on
     * its own.
     *
     * @return the trimmed text containing zero-width spaces, empty if the source
     *         is empty
     */
    private String initTextWithLineBreaks() {

        var result = "";

        if (!isEmpty(source)) {
            final var sourceSplitted = getSourceSplitted();
            if (sourceSplitted.size() == 1) {
                result = forceLineBreakForComputerProducedText(source);
            } else {
                result = forceLineBreakForHumanProducedText(sourceSplitted);
            }
        }

        return result.trim();
    }

    /**
     * Applies {@link #forceLineBreakForComputerProducedText(String)} to every word
     * and joins the results with a single space.
     *
     * @param sourceSplitted the source split on spaces
     * @return the processed text, with a trailing space
     */
    private String forceLineBreakForHumanProducedText(final List<String> sourceSplitted) {
        final var builder = new StringBuilder();
        for (final String text : sourceSplitted) {
            builder.append(forceLineBreakForComputerProducedText(text)).append(" ");
        }
        return builder.toString();
    }

    /**
     * Try to separate text target on native text breaks. If this is not enough use
     * brute-force on max allowed length.
     *
     * @param text target which will be analyzed
     * @return the text with a zero-width space after each of the punctuation
     *         characters in {@code REPLACEMENT_MAP} and inside every remaining run
     *         that is longer than the force-break length
     */
    private String forceLineBreakForComputerProducedText(final String text) {

        // try to separate on native text breaks
        var clean = text;
        for (final Entry<Pattern, String> entry : REPLACEMENT_MAP.entrySet()) {
            final var matcher = entry.getKey().matcher(clean);
            clean = matcher.replaceAll(entry.getValue());
        }

        // whatever is still too long between two break opportunities gets split by
        // length
        final var splittedByZeroWidthSpace = getSplittedByZeroWidthSpace(clean);
        final List<String> lengthTrimed = new ArrayList<>();

        for (final String item : splittedByZeroWidthSpace) {
            lengthTrimed.add(bruteForceSplit(item));
        }

        return Joiner.on(ZERO_WIDTH_SPACE).join(lengthTrimed);
    }

    /**
     * Verify if very long text still exists and execute brute-force dissipation
     *
     * @param text target
     * @return fragmented text if length doesn't fit to force length break,
     *         otherwise the text as given
     */
    private String bruteForceSplit(final String text) {
        // getForceLengthBreak() only returns positive values, therefore every
        // iteration below shortens tmp and the loop terminates.
        final var maxLength = getForceLengthBreak();
        if (!isEmpty(text)) {
            final var builder = new StringBuilder();
            var tmp = text;
            while (tmp.length() > maxLength) {
                builder.append(tmp, 0, maxLength).append(ZERO_WIDTH_SPACE);
                tmp = tmp.substring(maxLength);
            }
            if (!tmp.isEmpty()) {
                builder.append(tmp);
            }
            return builder.toString();
        }
        return text;
    }

    private static List<String> getSplittedByZeroWidthSpace(final String value) {
        return Splitter.on(ZERO_WIDTH_SPACE).splitToList(value);
    }

    private List<String> getSourceSplitted() {
        return Splitter.on(" ").splitToList(source);
    }
}