All downloads are free. The search and download functionality uses the official Maven repository.

org.daisy.pipeline.braille.common.HyphenatorWithExceptions Maven / Gradle / Ivy

The newest version!
package org.daisy.pipeline.braille.common;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.TreeMap;

import com.google.common.base.MoreObjects;
import com.google.common.base.MoreObjects.ToStringHelper;
import com.google.common.collect.Iterables;

import org.daisy.braille.css.BrailleCSSProperty.Hyphens;
import org.daisy.braille.css.SimpleInlineStyle;
import org.daisy.pipeline.braille.common.AbstractHyphenator.util.DefaultFullHyphenator;
import org.daisy.pipeline.braille.common.AbstractHyphenator.util.DefaultLineBreaker;
import org.daisy.pipeline.braille.common.util.Strings;
import org.daisy.pipeline.braille.common.util.Tuple2;
import org.daisy.pipeline.braille.css.CSSStyledText;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class HyphenatorWithExceptions extends AbstractHyphenator {

	// hyphenator that handles all words that are not in the exceptions list
	private final Hyphenator hyphenator;
	// source of the exceptions list; only kept for toStringHelper()
	private final Reader exceptionsFile;
	// maps each exception word (without hyphens) to its hyphenation opportunities, as returned by
	// Strings.extractHyphens()
	private final Map<String,byte[]> exceptionWords;

	/**
	 * Creates a hyphenator that consults a list of exception words before delegating to another
	 * hyphenator.
	 *
	 * @param hyphenator     the hyphenator to delegate to
	 * @param exceptionsFile the exceptions list, one word per line. It is read completely and closed
	 *                       by this constructor.
	 * @throws IOException if the exceptions list can not be read
	 */
	public HyphenatorWithExceptions(Hyphenator hyphenator, Reader exceptionsFile) throws IOException {
		this.exceptionWords = compileExceptionsFile(exceptionsFile);
		this.exceptionsFile = exceptionsFile;
		this.hyphenator = hyphenator;
	}

	// soft hyphen (U+00AD)
	private final static char SHY = '\u00AD';
	// zero-width space (U+200B)
	private final static char ZWSP = '\u200B';
	// for simplicity a word means a sequence of letters
	// (at least two letters, possibly with SHY or ZWSP characters in between)
	private final static Pattern WORD = Pattern.compile("\\p{L}[\\p{L}" + SHY + ZWSP + "]*\\p{L}");
	// a valid line of the exceptions list: at least two letters, with hyphens allowed in between
	private final static Pattern EXCEPTION_WORD = Pattern.compile("\\p{L}[\\p{L}-]*\\p{L}");

	// created on the first call of asFullHyphenator(); also used by the line breaker
	private FullHyphenator exceptionProcessor = null;
	// cached result of asFullHyphenator()
	private FullHyphenator fullHyphenator = null;
	// cached result of asLineBreaker()
	private LineBreaker lineBreaker = null;

	/**
	 * Returns a {@link FullHyphenator} that first inserts soft hyphens into the words that are
	 * present in the exceptions list and then passes the text on to the underlying hyphenator.
	 *
	 * The exceptions are only applied to text that is styled with {@code hyphens: auto}. The
	 * returned object is created on the first call and cached.
	 */
	@Override
	public FullHyphenator asFullHyphenator() {
		if (fullHyphenator == null) {
			FullHyphenator backingHyphenator = hyphenator.asFullHyphenator();
			if (exceptionProcessor == null)
				exceptionProcessor = new DefaultFullHyphenator(true) { // to preserve `hyphens: auto' in the output
						@Override
						protected boolean isCodePointAware() {
							return true;
						}
						@Override
						protected boolean isLanguageAdaptive() {
							return false;
						}
						/**
						 * Computes one value per position between two adjacent code points of the
						 * text. Positions inside a word from the exceptions list get the value that
						 * is stored for that word, all other positions are 0.
						 */
						@Override
						protected byte[] getHyphenationOpportunities(String textWithoutHyphens, Locale _language)
								throws NonStandardHyphenationException {
							int length = textWithoutHyphens.codePointCount(0, textWithoutHyphens.length());
							// guard against a negative array size for empty input
							// NOTE(review): assumes an empty array is what the base class expects here
							if (length == 0)
								return new byte[0];
							// all positions are initially 0 (no hyphenation opportunity)
							byte[] result = new byte[length - 1];
							// number of code points that precede the current segment
							int offset = 0;
							boolean isWord = false;
							for (String segment : Strings.splitInclDelimiter(textWithoutHyphens, WORD)) {
								int l = segment.codePointCount(0, segment.length());
								if (isWord) {
									byte[] r = exceptionWords.get(segment);
									if (r != null)
										// the position after the j-th code point of the word is the
										// position after code point (offset + j) of the text. Using the
										// code point offset (rather than the number of positions
										// written so far) keeps this correct when the text starts with
										// a single non-word character.
										System.arraycopy(r, 0, result, offset, l - 1);
								}
								offset += l;
								isWord = !isWord;
							}
							return result;
						}
				};
			fullHyphenator = new FullHyphenator() {
					@Override
					public Iterable<CSSStyledText> transform(Iterable<CSSStyledText> text) throws NonStandardHyphenationException {
						if (Iterables.any(text, t -> t.getStyle() != null && t.getStyle().getProperty("hyphens") == Hyphens.AUTO)) {
							// words that already contain SHY or ZWSP are not counted
							if (containsExceptionWord(Strings.join(Iterables.transform(text, CSSStyledText::getText))))
								// insert soft hyphens into the words (they will take priority over automatic hyphenation)
								text = exceptionProcessor.transform(text);
						}
						return backingHyphenator.transform(text);
					}
					@Override
					public String transform(String text, SimpleInlineStyle style, Locale language)
							throws NonStandardHyphenationException {
						if (style != null && style.getProperty("hyphens") == Hyphens.AUTO) {
							// words that already contain SHY or ZWSP are not counted
							if (containsExceptionWord(text))
								// insert soft hyphens into the words (they will take priority over automatic hyphenation)
								text = exceptionProcessor.transform(text, style, language);
						}
						return backingHyphenator.transform(text, style, language);
					}
					@Override
					public String toString() {
						return HyphenatorWithExceptions.this.toString();
					}
				};
		}
		return fullHyphenator;
	}

	/**
	 * Checks whether the text contains at least one word that is present in the exceptions list.
	 */
	private boolean containsExceptionWord(String text) {
		// the segments alternate between non-word and word, starting with non-word
		boolean isWord = false;
		for (String segment : Strings.splitInclDelimiter(text, WORD)) {
			if (isWord && exceptionWords.containsKey(segment))
				return true;
			isWord = !isWord;
		}
		return false;
	}

	/**
	 * Returns a {@link LineBreaker} that gives the words from the exceptions list priority over
	 * the underlying hyphenator. The returned object is created on the first call and cached.
	 *
	 * @throws UnsupportedOperationException if the underlying hyphenator's {@link Hyphenator#asLineBreaker()}
	 *                                       method throws an {@link UnsupportedOperationException}
	 */
	@Override
	public LineBreaker asLineBreaker() throws UnsupportedOperationException {
		if (lineBreaker == null) {
			LineBreaker backingHyphenator = hyphenator.asLineBreaker(); // if this throws a UnsupportedOperationException,
			                                                            // it means that hyphenator.asFullHyphenator(), and
			                                                            // therefore asFullHyphenator(), will never throw a
			                                                            // NonStandardHyphenationException, so it is
			                                                            // allowed for asLineBreaker() to throw a
			                                                            // UnsupportedOperationException
			// LineBreaker that breaks words at SHY and ZWSP. It is expected that ZWSP have already been inserted after
			// hard hyphens.
			LineBreaker standardLineBreaker = new DefaultLineBreaker() {};
			SimpleInlineStyle HYPHENS_AUTO = new SimpleInlineStyle("hyphens: auto");
			// hard hyphen between two letters or digits; compiled once instead of on every call
			Pattern compoundWordHyphen = Pattern.compile("(?<=[\\p{L}\\p{N}])-(?=[\\p{L}\\p{N}])");
			lineBreaker = new DefaultLineBreaker() {
					@Override
					public LineIterator transform(String text, Locale language) {
						// try to perform full hyphenation
						try {
							return standardLineBreaker.transform(
								HyphenatorWithExceptions.this.asFullHyphenator().transform(text, HYPHENS_AUTO, language),
								null);
						} catch (NonStandardHyphenationException e) {
							// try to process all exception words in the input before applying the main hyphenator
							// (soft hyphens inserted by the first pass will take priority)
							if (backingHyphenator != null) {
								// check whether text contains words that are present in the dictionary
								// (words that already contain SHY or ZWSP are not counted)
								boolean containsExceptionWords = false; {
									boolean isWord = false;
									for (String w : Strings.splitInclDelimiter(text, WORD)) {
										if (isWord && exceptionWords.containsKey(w)) {
											containsExceptionWords = true;
											break; }
										isWord = !isWord; }}
								if (containsExceptionWords)
									try {
										text = exceptionProcessor.transform(text, HYPHENS_AUTO, null);
									} catch (NonStandardHyphenationException ee) {
										// keep the cause so that the actual failure can be diagnosed
										throw new IllegalStateException("coding error", ee);
									}
								return backingHyphenator.transform(text, language);
							} else {
								// fall back to word-by-word processing (see breakWord())
								// first handle compound word hyphens
								text = compoundWordHyphen.matcher(text).replaceAll("-" + ZWSP);
								return super.transform(text, language);
							}
						}
					}
					@Override
					protected Break breakWord(String word, Locale language, int limit, boolean force) {
						if (word.length() <= limit)
							return new Break(word, limit, false);
						// part of the word that goes on the current line
						String line = "";
						boolean lineHasHyphen = false;
						// part of the word that has been processed but not (yet) added to the line
						String remainder = "";
						// further break up sequence of non white space characters
						boolean isWord = false;
						for (String w : Strings.splitInclDelimiter(word, WORD)) {
							int available = limit - line.length() - remainder.length();
							if (available <= 0) {
								if (line.isEmpty()) {
									// no break point found within the limit
									remainder = word;
									break;
								}
								remainder += w;
							// check if segment contains SHY or ZWSP or hard hyphen
							} else if (w.indexOf(SHY) >= 0 || w.indexOf(ZWSP) >= 0) {
								// the segment is wrapped in "x" characters, which are stripped again below
								LineIterator lines = standardLineBreaker.transform(String.format("x%sx", w), null);
								String next = lines.nextLine(available + 1, false);
								if (!next.isEmpty() && lines.hasNext()) {
									line += (remainder + next.substring(1));
									lineHasHyphen = lines.lineHasHyphen();
									remainder = lines.remainder();
									remainder = remainder.substring(0, remainder.length() - 1);
								} else {
									remainder += (next + lines.remainder());
									remainder = remainder.substring(1, remainder.length() - 1);
								}
							} else if (isWord) {
								LineIterator lines = null; {
									String fullyHyphenated = null;
									// check whether word is present in the dictionary
									if (exceptionWords.containsKey(w)) {
										// try full hyphenation of word
										try {
											fullyHyphenated = exceptionProcessor.transform(w, HYPHENS_AUTO, null);
										} catch (NonStandardHyphenationException e) {
											throw new IllegalStateException("coding error", e);
										}
									} else {
										// use main hyphenator if word does not contain SHY or ZWSP
										if (backingHyphenator != null)
											// only the current segment must be processed here: the preceding
											// segments are already contained in "line" and "remainder"
											lines = backingHyphenator.transform(w, language);
										else {
											try {
												fullyHyphenated = hyphenator.asFullHyphenator().transform(w, HYPHENS_AUTO, null);
											} catch (NonStandardHyphenationException e) {
												// if hyphenator.asLineBreaker() is null, hyphenator.asFullHyphenator() must
												// never throw a NonStandardHyphenationException
												throw new IllegalStateException("coding error", e);
											}
										}
									}
									if (lines == null) {
										if (fullyHyphenated == null)
											throw new IllegalStateException("coding error");
										lines = standardLineBreaker.transform(fullyHyphenated, null);
									}
								}
								String next = lines.nextLine(available, false);
								if (!next.isEmpty() && lines.hasNext()) {
									line += (remainder + next);
									lineHasHyphen = lines.lineHasHyphen();
									remainder = lines.remainder();
								} else {
									remainder += (next + lines.remainder());
								}
							} else {
								remainder += w;
							}
							isWord = !isWord;
						}
						// if a break point was found, return it
						if (!line.isEmpty())
							return new Break(line + remainder, line.length(), lineHasHyphen);
						if (force)
							// force break
							return new Break(word, limit, false);
						return new Break(word, 0, false);
					}
					@Override
					public String toString() {
						return HyphenatorWithExceptions.this.toString();
					}
				};
		}
		return lineBreaker;
	}

	/**
	 * Describes this hyphenator by the hyphenator it delegates to and the source of its exceptions
	 * list.
	 */
	@Override
	public ToStringHelper toStringHelper() {
		ToStringHelper helper = MoreObjects.toStringHelper("HyphenatorWithExceptions");
		helper.add("hyphenator", hyphenator);
		helper.add("exceptions", exceptionsFile);
		return helper;
	}

	/**
	 * Reads a list of exception words, one word per line, in which hyphens mark the allowed break
	 * points.
	 *
	 * Leading and trailing white space is ignored and blank lines are skipped. Lines that do not
	 * match {@code EXCEPTION_WORD} are skipped with a warning.
	 *
	 * @param file the exceptions list. It is closed when this method returns.
	 * @return a map from each word (with the hyphens removed) to its hyphenation opportunities, as
	 *         returned by {@code Strings.extractHyphens()}
	 * @throws IOException if the list can not be read
	 */
	private static Map<String,byte[]> compileExceptionsFile(Reader file) throws IOException {
		Map<String,byte[]> map = new TreeMap<>();
		try (BufferedReader reader = new BufferedReader(file)) {
			String line;
			while ((line = reader.readLine()) != null) {
				String word = line.trim();
				// blank lines are not worth a warning
				if (word.isEmpty())
					continue;
				if (!EXCEPTION_WORD.matcher(word).matches()) {
					logger.warn("Invalid word in exceptions list: {}", line);
					continue;
				}
				Tuple2<String,byte[]> t = Strings.extractHyphens(word, true, '-');
				map.put(t._1, t._2);
			}
		}
		return map;
	}

	private static final Logger logger = LoggerFactory.getLogger(HyphenatorWithExceptions.class);
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy