// org.daisy.pipeline.braille.common.AbstractHyphenator (Maven / Gradle / Ivy)
// The newest version!
package org.daisy.pipeline.braille.common;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import cz.vutbr.web.css.CSSProperty;
import org.daisy.braille.css.BrailleCSSProperty.Hyphens;
import org.daisy.braille.css.SimpleInlineStyle;
import static org.daisy.pipeline.braille.common.util.Strings.extractHyphens;
import static org.daisy.pipeline.braille.common.util.Strings.insertHyphens;
import static org.daisy.pipeline.braille.common.util.Strings.join;
import static org.daisy.pipeline.braille.common.util.Strings.splitInclDelimiter;
import org.daisy.pipeline.braille.common.util.Tuple2;
import org.daisy.pipeline.braille.css.CSSStyledText;
public abstract class AbstractHyphenator extends AbstractTransform implements Hyphenator {
/**
 * View this hyphenator as a {@link FullHyphenator}.
 *
 * This base implementation does not provide such a view and always throws.
 *
 * @throws UnsupportedOperationException always, unless overridden
 */
public FullHyphenator asFullHyphenator() throws UnsupportedOperationException {
    throw new UnsupportedOperationException();
}
/**
 * View this hyphenator as a {@link LineBreaker}.
 *
 * This base implementation does not provide such a view and always throws.
 *
 * @throws UnsupportedOperationException always, unless overridden
 */
public LineBreaker asLineBreaker() throws UnsupportedOperationException {
    throw new UnsupportedOperationException();
}
/* ================== */
/* UTILS */
/* ================== */
public static abstract class util {
/** Soft hyphen (U+00AD). */
private static final char SHY = '\u00AD';
/** Zero-width space (U+200B). */
private static final char ZWSP = '\u200B';
/** Matches a run of one or more white space characters. */
private static final Pattern ON_SPACE_SPLITTER = Pattern.compile("\\s+");
/**
 * Matches a letter or number followed by "-", provided the character after the "-" is
 * also a letter or number. The trailing character is only looked at, not consumed.
 */
private static final Pattern COMPOUND_WORD_HYPHEN = Pattern.compile("[\\p{L}\\p{N}]-(?=[\\p{L}\\p{N}])");
/**
 * {@link FullHyphenator} that dispatches to other hyphenators based on language.
 */
public static abstract class LanguageBasedDispatchingFullHyphenator implements FullHyphenator {

    /**
     * Hyphenate a sequence of segments that all have the same language.
     *
     * @param text     the segments to hyphenate
     * @param language the language shared by all segments (as returned by
     *                 {@code CSSStyledText.getLanguage()}), possibly {@code null}
     */
    protected abstract Iterable<CSSStyledText> transform(Iterable<CSSStyledText> text, Locale language)
            throws NonStandardHyphenationException;

    /**
     * Hyphenate a sequence of segments that may have different languages.
     *
     * If all segments share one language the input is passed on unchanged to {@link
     * #transform(Iterable, Locale)}. Otherwise the input is cut into runs of consecutive
     * segments with equal language, each run is transformed separately, and the results
     * are concatenated in order.
     */
    public Iterable<CSSStyledText> transform(Iterable<CSSStyledText> text) throws NonStandardHyphenationException {
        // determine whether all segments have the same language
        Locale singleLang = null;
        boolean mixedLang = false;
        // must start out true so that the language of the first segment is recorded;
        // otherwise every input with a non-null language would be treated as mixed
        boolean first = true;
        for (CSSStyledText t : text) {
            if (first) {
                singleLang = t.getLanguage();
                first = false;
            } else if (!Objects.equals(singleLang, t.getLanguage())) {
                mixedLang = true;
                break;
            }
        }
        if (!mixedLang)
            return transform(text, singleLang);

        // chunk up in chunks of same language
        List<CSSStyledText> result = new ArrayList<>();
        List<CSSStyledText> cur = null;
        Locale curLang = null;
        for (CSSStyledText t : text) {
            Locale lang = t.getLanguage();
            if (cur != null && !Objects.equals(curLang, lang)) {
                // language changed: flush the current chunk
                for (CSSStyledText tt : transform(cur, curLang))
                    result.add(tt);
                cur = null;
            }
            if (cur == null)
                cur = new ArrayList<>();
            cur.add(t);
            curLang = lang;
        }
        if (cur != null)
            for (CSSStyledText tt : transform(cur, curLang))
                result.add(tt);
        return result;
    }
}
public static abstract class DefaultFullHyphenator extends LanguageBasedDispatchingFullHyphenator {
/**
 * Whether {@code hyphens} properties are left in place in the output of {@link
 * #transform(Iterable)}.
 */
private final boolean keepStyle;

/**
 * Create a hyphenator that does not preserve {@code hyphens} properties in its output.
 */
protected DefaultFullHyphenator() {
    this(false);
}

/**
 * @param keepStyle Whether to preserve {@code hyphens} properties in the output of
 *                  {@link #transform(Iterable)}. This is useful when the output of a
 *                  hyphenator needs to be fed into a second hyphenator.
 */
protected DefaultFullHyphenator(boolean keepStyle) {
    this.keepStyle = keepStyle;
}
/**
 * Whether the length of the array returned by {@link
 * #getHyphenationOpportunities(String, Locale)} is based on the number of code points
 * in the input or the number of characters.
 *
 * @return {@code true} if positions are counted in code points, {@code false} if they
 *         are counted in {@code char}s
 */
protected abstract boolean isCodePointAware();
/**
 * Whether {@link #getHyphenationOpportunities(String, Locale)} takes into account the
 * {@code language} argument.
 *
 * @return {@code false} if the language is ignored, in which case {@link
 *         #transform(Iterable)} processes the whole input at once with a {@code null}
 *         language
 */
protected abstract boolean isLanguageAdaptive();
/**
 * Get hyphenation opportunities as a byte array (1 = SHY, 2 = ZWSP)
 *
 * @param textWithoutHyphens text that does not contain SHY and ZWSP characters (and from
 *                           which no SHY and ZWSP characters were extracted either)
 * @param language           the language of the text; {@code null} is passed when
 *                           {@link #isLanguageAdaptive()} is {@code false}
 * @return the hyphenation opportunities, or {@code null}, which callers in this class
 *         treat as "no opportunities". NOTE(review): callers index the array up to the
 *         number of characters (or code points) minus one, so it presumably has one
 *         entry per gap between adjacent characters; confirm against implementations.
 */
protected abstract byte[] getHyphenationOpportunities(String textWithoutHyphens, Locale language)
throws NonStandardHyphenationException;
/** Unit separator (U+001F), used to mark segment boundaries in the joined text. */
private static final char US = '\u001F';
/** Splits a joined string back into segments at {@link #US}. */
private static final Splitter SEGMENT_SPLITTER = Splitter.on(US);
/**
 * Hyphenate a sequence of segments.
 *
 * If this hyphenator is not language adaptive, the whole input is processed in one go
 * with a {@code null} language. Otherwise the input is split by language by the super
 * class.
 */
@Override
public Iterable<CSSStyledText> transform(Iterable<CSSStyledText> text) throws NonStandardHyphenationException {
    if (!isLanguageAdaptive())
        // language does not matter, so there is no need to chunk by language
        return transform(text, null);
    return super.transform(text);
}
/**
 * Optimized version of {@link #transform(Iterable)} for cases where the input has a
 * single segment.
 *
 * @param text     the text to hyphenate; may contain SHY and ZWSP characters
 * @param style    the style of the text; only the {@code hyphens} property is read. May
 *                 be {@code null}, which is treated like {@code hyphens: manual}.
 * @param language the language of the text
 * @return the text with SHY and ZWSP characters inserted according to the {@code
 *         hyphens} property, and with a ZWSP after every hard hyphen
 */
public String transform(String text, SimpleInlineStyle style, Locale language)
        throws NonStandardHyphenationException {
    Tuple2<String,byte[]> t = extractHyphens(text, isCodePointAware(), SHY, ZWSP);
    String textWithoutHyphens = t._1;
    byte[] manualHyphens = t._2;

    // insert zero-width space after hard hyphens ("-" followed and preceded by a letter
    // or number)
    byte[] hardHyphens = null;
    int len = textWithoutHyphens.length();
    if (len > 1) {
        hardHyphens = new byte[len - 1];
        Matcher m = COMPOUND_WORD_HYPHEN.matcher(textWithoutHyphens);
        while (m.find())
            // the match starts at the character before "-", so the gap right after
            // the "-" has index start + 1; 4 is the value that insertHyphens() below
            // renders as its third character (ZWSP)
            hardHyphens[m.start() + 1] = 4;
        if (isCodePointAware())
            // the array above is indexed by char; re-extract to get it indexed by
            // code point
            hardHyphens = extractHyphens(
                insertHyphens(textWithoutHyphens, hardHyphens, false, null, null, ZWSP),
                true, null, null, ZWSP)._2;
    }
    if (hardHyphens != null) {
        if (manualHyphens == null)
            manualHyphens = hardHyphens;
        else {
            // merge every position, including the last one (a hard hyphen right before
            // the final character); min() guards against a length mismatch
            int n = Math.min(manualHyphens.length, hardHyphens.length);
            for (int k = 0; k < n; k++)
                manualHyphens[k] |= hardHyphens[k];
        }
    }

    Hyphens hyphenate = style != null ? style.getProperty("hyphens") : null;
    if (hyphenate == Hyphens.AUTO)
        return insertHyphens(textWithoutHyphens,
                             transform(manualHyphens, textWithoutHyphens, language),
                             isCodePointAware(),
                             SHY, ZWSP, ZWSP)
            .replace("" + ZWSP + ZWSP, "" + ZWSP);
    else if (hyphenate == null || hyphenate == Hyphens.MANUAL)
        return insertHyphens(textWithoutHyphens, manualHyphens, isCodePointAware(), SHY, ZWSP, ZWSP)
            .replace("" + ZWSP + ZWSP, "" + ZWSP);
    else // hyphenate == Hyphens.NONE
        // FIXME: better would be to only remove SHY and ZWSP within words, but the
        // issue with this is that what constitutes a "word" is language dependent
        // and finding words requires NLP
        return insertHyphens(textWithoutHyphens, manualHyphens, isCodePointAware(), null, null, ZWSP);
}
/**
 * Hyphenate a sequence of segments that all have the same language.
 *
 * The segments are joined into one string (separated by US characters) so that words
 * that span segment boundaries can be hyphenated as a whole. Afterwards the result is
 * split up again and each segment gets the variant that corresponds with its {@code
 * hyphens} property ({@code auto}, {@code manual} or {@code none}; a missing property
 * is treated as {@code manual}).
 *
 * @param text     the segments to hyphenate
 * @param language the language of the segments
 * @return a list with one element per input segment. Unless {@code keepStyle} was set,
 *         the elements are clones of the input from which the {@code hyphens} property
 *         has been removed.
 */
protected Iterable<CSSStyledText> transform(Iterable<CSSStyledText> text, Locale language)
        throws NonStandardHyphenationException {
    List<CSSStyledText> result = new ArrayList<>();
    List<CSSProperty> hyphenate = new ArrayList<>();
    boolean someHyphenate = false;
    for (CSSStyledText t : text) {
        if (!keepStyle)
            // work on a copy so that removing the property does not affect the input
            t = t.clone();
        SimpleInlineStyle style = t.getStyle();
        CSSProperty h = style != null ? style.getProperty("hyphens") : null;
        if (h == null) h = Hyphens.MANUAL;
        hyphenate.add(h);
        if (h == Hyphens.AUTO)
            someHyphenate = true;
        if (!keepStyle && style != null)
            style.removeProperty("hyphens");
        result.add(t);
    }
    if (result.isEmpty())
        return result;

    // first extract SHY and ZWSP, then extract the segment boundaries (US) into the
    // same array
    Tuple2<String,byte[]> t = extractHyphens(
        join(Iterables.transform(text, CSSStyledText::getText), US), isCodePointAware(), SHY, ZWSP);
    List<String> textWithoutHyphens = SEGMENT_SPLITTER.splitToList(t._1);
    t = extractHyphens(t._2, t._1, isCodePointAware(), null, null, null, US);
    String joinedTextWithoutHyphens = t._1;
    byte[] manualHyphensAndSegmentBoundaries = t._2;

    // insert zero-width space after hard hyphens ("-" followed and preceded by a letter
    // or number)
    byte[] hardHyphens = null;
    int len = joinedTextWithoutHyphens.length();
    if (len > 1) {
        hardHyphens = new byte[len - 1];
        Matcher m = COMPOUND_WORD_HYPHEN.matcher(joinedTextWithoutHyphens);
        while (m.find())
            // the match starts at the character before "-", so the gap right after
            // the "-" has index start + 1; 4 is the value that insertHyphens() below
            // renders as its third character (ZWSP)
            hardHyphens[m.start() + 1] = 4;
        if (isCodePointAware())
            // the array above is indexed by char; re-extract to get it indexed by
            // code point
            hardHyphens = extractHyphens(
                insertHyphens(joinedTextWithoutHyphens, hardHyphens, false, null, null, ZWSP),
                true, null, null, ZWSP)._2;
    }
    if (hardHyphens != null) {
        if (manualHyphensAndSegmentBoundaries == null)
            manualHyphensAndSegmentBoundaries = hardHyphens;
        else {
            // merge every position, including the last one (a hard hyphen right before
            // the final character); min() guards against a length mismatch
            int n = Math.min(manualHyphensAndSegmentBoundaries.length, hardHyphens.length);
            for (int k = 0; k < n; k++)
                manualHyphensAndSegmentBoundaries[k] |= hardHyphens[k];
        }
    }

    byte[] hyphensAndSegmentBoundaries = someHyphenate
        ? transform(manualHyphensAndSegmentBoundaries, joinedTextWithoutHyphens, language)
        : manualHyphensAndSegmentBoundaries;

    // the three variants of the text, split into the non-empty segments
    List<String> textWithHyphensAuto =
        SEGMENT_SPLITTER.splitToList(
            insertHyphens(
                joinedTextWithoutHyphens, hyphensAndSegmentBoundaries, isCodePointAware(), SHY, ZWSP, ZWSP, US)
            .replace("" + ZWSP + ZWSP, "" + ZWSP));
    List<String> textWithHyphensManual =
        SEGMENT_SPLITTER.splitToList(
            insertHyphens(
                joinedTextWithoutHyphens, manualHyphensAndSegmentBoundaries, isCodePointAware(), SHY, ZWSP, ZWSP, US)
            .replace("" + ZWSP + ZWSP, "" + ZWSP));
    List<String> textWithHyphensNone =
        SEGMENT_SPLITTER.splitToList(
            insertHyphens(
                joinedTextWithoutHyphens, manualHyphensAndSegmentBoundaries, isCodePointAware(), null, null, ZWSP, US));

    // j indexes the non-empty segments only: empty segments are skipped in the lists
    // above
    int j = 0;
    for (int i = 0; i < result.size(); i++) {
        CSSStyledText segment = result.get(i);
        String newText;
        if (textWithoutHyphens.get(i).isEmpty())
            newText = "";
        else {
            CSSProperty h = hyphenate.get(i);
            // FIXME: for "none", better would be to only remove SHY and ZWSP within
            // words, but the issue with this is that what constitutes a "word" is
            // language dependent and finding words requires NLP
            List<String> variant = h == Hyphens.AUTO ? textWithHyphensAuto
                                 : h == Hyphens.MANUAL ? textWithHyphensManual
                                 : textWithHyphensNone;
            newText = variant.get(j++);
        }
        result.set(i, new CSSStyledText(newText,
                                        segment.getStyle(),
                                        segment.getLanguage(),
                                        segment.getTextAttributes()));
    }
    return result;
}
/**
 * Combine the given manual hyphens with the opportunities computed by {@link
 * #getHyphenationOpportunities(String, Locale)}.
 *
 * If the input has no SHY or ZWSP at all, the whole text is hyphenated in one go.
 * Otherwise the text is split on white space and only the words without any mark
 * inside them are hyphenated.
 *
 * @param manualHyphens      SHY, ZWSP and US characters that were extracted from the
 *                           original text, which resulted in textWithoutHyphens; may
 *                           be {@code null}
 * @param textWithoutHyphens text without SHY, ZWSP and US characters
 * @param language           passed on to {@code getHyphenationOpportunities()}
 * @return the combined array; {@code manualHyphens} itself if the text is empty
 */
protected final byte[] transform(byte[] manualHyphens, String textWithoutHyphens, Locale language)
        throws NonStandardHyphenationException {
    if (textWithoutHyphens.isEmpty())
        return manualHyphens;

    // is there any entry that is exactly SHY (1) or ZWSP (2)?
    boolean shyOrZwspPresent = false;
    if (manualHyphens != null)
        for (int i = 0; i < manualHyphens.length && !shyOrZwspPresent; i++) {
            byte mark = manualHyphens[i];
            shyOrZwspPresent = (mark == (byte)1 || mark == (byte)2);
        }

    if (!shyOrZwspPresent) {
        // no SHY or ZWSP in the input: hyphenate everything and merge in whatever other
        // marks were given
        byte[] opportunities = getHyphenationOpportunities(textWithoutHyphens, language);
        if (manualHyphens == null)
            return opportunities;
        if (opportunities == null)
            return manualHyphens;
        for (int i = 0; i < opportunities.length; i++)
            opportunities[i] |= manualHyphens[i];
        return opportunities;
    }

    // input contains SHY or ZWSP; hyphenate only the words without SHY or ZWSP
    // FIXME: for simplicity a "word" means a sequence of non white space here, but
    // a better definition is needed
    byte[] merged = Arrays.copyOf(manualHyphens, manualHyphens.length);
    boolean isWord = true; // segments alternate between word and white space
    int offset = 0;
    for (String segment : splitInclDelimiter(textWithoutHyphens, ON_SPACE_SPLITTER)) {
        int size = isCodePointAware()
            ? segment.codePointCount(0, segment.length())
            : segment.length();
        if (isWord && size > 0) {
            // a word is left alone as soon as any mark is present inside it
            boolean unmarked = true;
            for (int k = 0; k < size - 1 && unmarked; k++)
                unmarked = merged[offset + k] == 0;
            if (unmarked) {
                byte[] wordHyphens = getHyphenationOpportunities(segment, language);
                if (wordHyphens != null)
                    for (int k = 0; k < size - 1; k++)
                        merged[offset + k] |= wordHyphens[k];
            }
        }
        offset += size;
        isWord = !isWord;
    }
    return merged;
}
}
/**
* {@link FullHyphenator} that does not provide any hyphenation opportunities (only soft
* wrap opportunities outside words).
*/
public static class NoHyphenator extends DefaultFullHyphenator {
public Iterable transform(java.lang.Iterable text) {
if (COMPOUND_WORD_HYPHEN.matcher(
join(com.google.common.collect.Iterables.transform(text, CSSStyledText::getText))).find())
return super.transform(text);
else
return text;
}
protected boolean isCodePointAware() { return true; }
protected boolean isLanguageAdaptive() { return false; }
protected byte[] getHyphenationOpportunities(String textWithoutManualHyphens, Locale language) {
return null;
}
}
// TODO: caching?
public static abstract class DefaultLineBreaker implements LineBreaker {
/**
 * Break a sequence of non white space characters into two parts.
 *
 * This method is responsible for taking into account SHY or ZWSP within the input. SHY
 * or ZWSP within an actual word (morphological unit) should be used as break
 * opportunities without any additional hyphenation.
 *
 * This method should be overriden. The default behavior is that a word is only broken
 * at SHY and ZWSP and after hard hyphens, or if the {@code force} argument is
 * {@code true}. The {@code language} argument is ignored.
 *
 * @param word      the word to break; may contain SHY and ZWSP
 * @param _language ignored by this implementation
 * @param limit     the maximum length of the first part
 * @param force     whether to break at {@code limit} if there is no other opportunity
 * @return the broken word. The text of the result has the SHY and ZWSP characters
 *         removed, except when the unmodified input already fits within the limit.
 */
protected Break breakWord(String word, Locale _language, int limit, boolean force) {
    // the position must not exceed the length of the text, otherwise
    // Break.firstLine() fails
    if (word.length() <= limit)
        return new Break(word, word.length(), false);

    // break at SHY or ZWSP
    Tuple2<String,byte[]> t = extractHyphens(word, true, ZWSP, SHY);
    word = t._1;
    if (word.length() <= limit)
        return new Break(word, word.length(), false);
    byte[] manualHyphens = t._2;
    // null when the word contained no SHY or ZWSP
    if (manualHyphens != null)
        // try the opportunities from last to first in order to fill up the line as
        // much as possible
        for (int k = manualHyphens.length - 1; k >= 0; k--)
            if (manualHyphens[k] != 0) {
                int br = word.offsetByCodePoints(0, k + 1);
                String next = word.substring(0, br);
                if (next.length() <= limit) {
                    String remainder = word.substring(br);
                    // 2 presumably stands for the second character passed to
                    // extractHyphens() above, i.e. SHY
                    return new Break(next + remainder, next.length(), manualHyphens[k] == 2);
                }
            }

    // break after hard hyphens
    // NOTE(review): only the first hard hyphen in the word is considered
    Matcher m = COMPOUND_WORD_HYPHEN.matcher(word);
    if (m.find()) {
        int len = m.start() + 2;
        if (len <= limit)
            return new Break(word, len, false);
    }
    if (force)
        return new Break(word, limit, false);
    return new Break(word, 0, false);
}
/**
 * A piece of text together with the position at which it is broken into a first and a
 * second line.
 */
protected static class Break {

    /** The complete text (first and second line concatenated). */
    private final String text;
    /** Index in {@code text} at which the second line starts. */
    private final int position;
    /** Flag that is copied to the "line has hyphen" state of the line iterator. */
    private final boolean hyphen;

    public Break(String text, int position, boolean hyphen) {
        this.text = text;
        this.position = position;
        this.hyphen = hyphen;
    }

    /** The part of the text before the break position. */
    private String firstLine() {
        return text.substring(0, position);
    }

    /** The part of the text from the break position onward. */
    private String secondLine() {
        return text.substring(position);
    }

    /** Both parts, separated by "=". */
    @Override
    public String toString() {
        return String.join("=", firstLine(), secondLine());
    }
}
/**
 * Create an iterator that cuts the given text into lines.
 *
 * The text is split on white space. Segments are added to the current line as long as
 * they fit. A word that does not fit is handed to {@code breakWord()} if hyphenation is
 * allowed, otherwise it is moved to the next line in its entirety.
 *
 * @param text     the text to break into lines
 * @param language passed on to {@code breakWord()}
 */
public LineIterator transform(final String text, final Locale language) {
    return new LineIterator() {

        // text that has not been returned yet, or null if everything was consumed
        String remainder = text;
        boolean lineHasHyphen = false;
        boolean started = false;

        // state saved by mark() and restored by reset()
        String remainderAtMark = text;
        boolean lineHasHyphenAtMark = false;
        boolean startedAtMark = false;

        public String nextLine(int limit, boolean force) {
            return nextLine(limit, force, true);
        }

        public String nextLine(int limit, boolean force, boolean allowHyphens) {
            started = true;
            lineHasHyphen = false;
            if (remainder == null)
                return "";
            if (remainder.length() <= limit) {
                // everything that is left fits on the line
                String all = remainder;
                remainder = null;
                return all;
            }
            StringBuilder line = new StringBuilder();
            StringBuilder rest = new StringBuilder();
            int available = limit;
            boolean word = true; // segments alternate between word and white space
            for (String segment : splitInclDelimiter(remainder, ON_SPACE_SPLITTER)) {
                if (available == 0) {
                    // line is closed; everything else goes to the next lines
                    rest.append(segment);
                } else if (segment.length() <= available) {
                    line.append(segment);
                    available -= segment.length();
                    word = !word;
                } else {
                    if (word && allowHyphens) {
                        // segment may contain SHY or ZWSP; breakWord() is responsible for correctly taking them into account
                        // breaking is only forced if the line is still empty
                        Break brokenWord = breakWord(segment, language, available, force && (available == limit));
                        line.append(brokenWord.firstLine());
                        lineHasHyphen = brokenWord.hyphen;
                        rest.append(brokenWord.secondLine());
                    } else {
                        rest.append(segment);
                    }
                    available = 0;
                }
            }
            remainder = rest.length() == 0 ? null : rest.toString();
            return line.toString();
        }

        public boolean hasNext() {
            return remainder != null;
        }

        public boolean lineHasHyphen() {
            if (!started)
                throw new RuntimeException("nextLine must be called first.");
            return lineHasHyphen;
        }

        public String remainder() {
            return remainder;
        }

        public void mark() {
            remainderAtMark = remainder;
            lineHasHyphenAtMark = lineHasHyphen;
            startedAtMark = started;
        }

        public void reset() {
            remainder = remainderAtMark;
            lineHasHyphen = lineHasHyphenAtMark;
            started = startedAtMark;
        }
    };
}
}
}
}