net.sf.okapi.lib.segmentation.SRXSegmenter Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of okapi-lib-segmentation Show documentation
There is a newer version: 1.47.0
Show newest version
/*===========================================================================
  Copyright (C) 2008-2012 by the Okapi Framework contributors
-----------------------------------------------------------------------------
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
===========================================================================*/

package net.sf.okapi.lib.segmentation;

import static net.sf.okapi.common.resource.TextFragment.Marker.CLOSING;
import static net.sf.okapi.common.resource.TextFragment.Marker.ISOLATED;
import static net.sf.okapi.common.resource.TextFragment.Marker.OPENING;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet;
import java.util.LinkedList;
import java.util.List;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.util.ULocale;

import net.sf.okapi.common.ISegmenter;
import net.sf.okapi.common.LocaleId;
import net.sf.okapi.common.Range;
import net.sf.okapi.common.resource.TextContainer;
import net.sf.okapi.common.resource.TextFragment;
import net.sf.okapi.common.resource.TextFragment.Marker;
import net.sf.okapi.common.resource.TextUnitUtil;

/**
 * Implements the {@link ISegmenter} interface for SRX rules.
 */
public class SRXSegmenter implements ISegmenter {

	/**
	 * Text substituted for each isolated code marker when isolated codes are
	 * treated as white-space.
	 */
	private static final String ISOLATED_CODE_REPLACEMENT_TEXT = " ";

	/**
	 * Number of characters an in-line code marker occupies in the coded text
	 * (the marker character plus its index character).
	 */
	private static final int CODE_MARKER_LENGTH = 2;

	private final Logger LOGGER = LoggerFactory.getLogger(getClass());
	
	private boolean segmentSubFlows;
	private boolean cascade;
	private boolean includeStartCodes;
	private boolean includeEndCodes;
	private boolean includeIsolatedCodes;
	private LocaleId currentLanguageCode;
	private boolean oneSegmentIncludesAll; // Extension
	private boolean trimLeadingWS; // Extension
	private boolean trimTrailingWS; // Extension
	private boolean useJavaRegex; // Deprecated Extension, always true.
	private boolean useIcu4JBreakRules = false;
	private boolean trimCodes; // Extension
	private boolean treatIsolatedCodesAsWhitespace; // Extension
	// Compiled rules currently applied by computeSegments()
	private ArrayList<CompiledRule> rules;
	private Pattern maskRule; // Extension
	// Candidate positions in the coded text mapped to true (break) or false (no break)
	private TreeMap<Integer, Boolean> splits;
	// Break positions after adjustment for the in-line code inclusion options
	private List<Integer> finalSplits;
	// Start and end position of each segment; ends holds one extra value (the coded text length)
	private ArrayList<Integer> starts;
	private ArrayList<Integer> ends;
	private BreakIterator icu4jBreakIterator;

	/**
	 * Creates a new SRXSegmenter object with all its options set to the
	 * values assigned by {@link #reset()}.
	 */
	public SRXSegmenter () {
		reset();
	}

	/**
	 * Puts this segmenter back in its initial state: no language selected, no rules,
	 * no mask rule, no computed splits, and every option set to its default value.
	 */
	@Override
	public void reset () {
		// Language, rules and computed data
		currentLanguageCode = null;
		rules = new ArrayList<>();
		maskRule = null;
		splits = null;

		// Options defined by SRX
		segmentSubFlows = true; // SRX default
		cascade = false; // SRX does not define a default for this one
		includeStartCodes = false; // SRX default
		includeEndCodes = true; // SRX default
		includeIsolatedCodes = false; // SRX default

		// Extensions
		oneSegmentIncludesAll = false;
		trimLeadingWS = false; // IN TEST (was true for StringInfo)
		trimTrailingWS = false; // IN TEST (was true for StringInfo)
		trimCodes = false; // IN TEST (was false for StringInfo) NOT USED for now
		treatIsolatedCodesAsWhitespace = false;
		useIcu4JBreakRules = false;
		useJavaRegex = true; // Deprecated extension
	}

	/**
	 * Sets all the options of this segmenter in one call.
	 * An error is logged if <code>useJavaRegex</code> is false.
	 * @param segmentSubFlows true to segment sub-flows, false to leave them un-segmented.
	 * @param includeStartCodes true to keep the start codes found just before a break in the
	 * 'left' segment, false to put them in the next segment.
	 * @param includeEndCodes true to keep the end codes found just before a break in the
	 * 'left' segment, false to put them in the next segment.
	 * @param includeIsolatedCodes true to keep the isolated codes found just before a break in the
	 * 'left' segment, false to put them in the next segment.
	 * @param oneSegmentIncludesAll true to include everything in a segment that is alone.
	 * @param trimLeadingWS true to trim the leading white-spaces of the segments, false to keep them.
	 * @param trimTrailingWS true to trim the trailing white-spaces of the segments, false to keep them.
	 * @param useJavaRegex true if the rules are for the Java regular expression engine,
	 * false if they are for ICU.
	 * @param useIcu4JBreakRules true to add, after the SRX rules, break rules generated from the
	 * ICU4J break iterator when computing the segments.
	 * @param treatIsolatedCodesAsWhitespace true to replace the isolated code markers by spaces
	 * before applying the rules, false to simply remove them.
	 */
	public void setOptions (boolean segmentSubFlows,
		boolean includeStartCodes,
		boolean includeEndCodes,
		boolean includeIsolatedCodes,
		boolean oneSegmentIncludesAll,
		boolean trimLeadingWS,
		boolean trimTrailingWS,
		boolean useJavaRegex,
		boolean useIcu4JBreakRules,
		boolean treatIsolatedCodesAsWhitespace)
	{
		// Sub-flow handling
		this.segmentSubFlows = segmentSubFlows;
		this.oneSegmentIncludesAll = oneSegmentIncludesAll;

		// In-line codes
		this.includeStartCodes = includeStartCodes;
		this.includeEndCodes = includeEndCodes;
		this.includeIsolatedCodes = includeIsolatedCodes;
		this.treatIsolatedCodesAsWhitespace = treatIsolatedCodesAsWhitespace;

		// White-space trimming
		this.trimLeadingWS = trimLeadingWS;
		this.trimTrailingWS = trimTrailingWS;

		// Rule engines
		this.useIcu4JBreakRules = useIcu4JBreakRules;
		this.useJavaRegex = useJavaRegex;
		if ( !useJavaRegex ) {
			LOGGER.error("Use of ICU regex has been removed.");
		}
	}
	
	/**
	 * Sets the basic options of this segmenter; the other options keep their current values.
	 * @param segmentSubFlows true to segment sub-flows, false to leave them un-segmented.
	 * @param includeStartCodes true to keep the start codes found just before a break in the
	 * 'left' segment, false to put them in the next segment.
	 * @param includeEndCodes true to keep the end codes found just before a break in the
	 * 'left' segment, false to put them in the next segment.
	 * @param includeIsolatedCodes true to keep the isolated codes found just before a break in the
	 * 'left' segment, false to put them in the next segment.
	 * @param oneSegmentIncludesAll true to include everything in a segment that is alone.
	 * @param trimLeadingWS true to trim the leading white-spaces of the segments, false to keep them.
	 * @param trimTrailingWS true to trim the trailing white-spaces of the segments, false to keep them.
	 */
	@Override
	public void setOptions (boolean segmentSubFlows,
		boolean includeStartCodes,
		boolean includeEndCodes,
		boolean includeIsolatedCodes,
		boolean oneSegmentIncludesAll,
		boolean trimLeadingWS,
		boolean trimTrailingWS)
	{
		// Sub-flow handling
		this.segmentSubFlows = segmentSubFlows;
		this.oneSegmentIncludesAll = oneSegmentIncludesAll;

		// In-line codes
		this.includeStartCodes = includeStartCodes;
		this.includeEndCodes = includeEndCodes;
		this.includeIsolatedCodes = includeIsolatedCodes;

		// White-space trimming
		this.trimLeadingWS = trimLeadingWS;
		this.trimTrailingWS = trimTrailingWS;
	}
	
	/**
	 * Indicates if a segment that is alone must include everything.
	 * @return the current value of the one-segment-includes-all option.
	 */
	@Override
	public boolean oneSegmentIncludesAll () {
		return oneSegmentIncludesAll;
	}

	/**
	 * Indicates if sub-flows must be segmented.
	 * @return true if sub-flows must be segmented, false otherwise.
	 */
	@Override
	public boolean segmentSubFlows () {
		return segmentSubFlows;
	}
	
	/**
	 * Indicates if cascading must be applied when selecting the rules for 
	 * a given language pattern.
	 * The value is false after a call to {@link #reset()}.
	 * @return true if cascading must be applied, false otherwise.
	 */
	public boolean cascade () {
		return cascade;
	}
	
	/**
	 * Indicates if leading white-spaces must be trimmed from the segments.
	 * @return true if leading white-spaces are trimmed, false if they are kept.
	 */
	@Override
	public boolean trimLeadingWhitespaces () {
		return trimLeadingWS;
	}
	
	/**
	 * Indicates if trailing white-spaces must be trimmed from the segments.
	 * @return true if trailing white-spaces are trimmed, false if they are kept.
	 */
	@Override
	public boolean trimTrailingWhitespaces () {
		return trimTrailingWS;
	}
	
	/**
	 * Indicates if this document has rules that are defined for the Java regular expression engine (vs ICU).
	 * The value is true after a call to {@link #reset()}.
	 * @return true if the rules are for the Java regular expression engine, false if they are for ICU.
	 */
	public boolean useJavaRegex () {
		return useJavaRegex;
	}

	/**
	 * Indicates if the isolated code markers are replaced by spaces (instead of being removed)
	 * before the rules are applied.
	 * @return true if isolated codes are treated as white-space, false if they are removed.
	 */
	@Override
	public boolean treatIsolatedCodesAsWhitespace () {
		return treatIsolatedCodesAsWhitespace;
	}

	/**
	 * Sets the flag telling whether the rules of this document are written for the Java
	 * regular expression engine or for ICU. A warning is logged when ICU is requested.
	 * @param useJavaRegex true to treat the rules as Java regular expressions, false for ICU.
	 */
	public void setUseJavaRegex (boolean useJavaRegex) {
		this.useJavaRegex = useJavaRegex;
		if ( useJavaRegex ) {
			return; // Nothing to report for the Java engine
		}
		LOGGER.warn("Use of ICU regex is deprecated and may be removed in the future.");
	}
	
	/**
	 * Indicates if start codes found just before a break are kept in the 'left' segment.
	 * @return true if they are kept in the 'left' segment, false if they go in the next segment.
	 */
	@Override
	public boolean includeStartCodes () {
		return includeStartCodes;
	}
	
	/**
	 * Indicates if end codes found just before a break are kept in the 'left' segment.
	 * @return true if they are kept in the 'left' segment, false if they go in the next segment.
	 */
	@Override
	public boolean includeEndCodes () {
		return includeEndCodes;
	}
	
	/**
	 * Indicates if isolated codes found just before a break are kept in the 'left' segment.
	 * @return true if they are kept in the 'left' segment, false if they go in the next segment.
	 */
	@Override
	public boolean includeIsolatedCodes () {
		return includeIsolatedCodes;
	}
	
	/**
	 * Computes the segments of a plain string by wrapping it in a new
	 * {@link TextContainer} and segmenting that container.
	 * @param text the text to segment.
	 * @return the number of segments found.
	 */
	@Override
	public int computeSegments (String text) {
		return computeSegments(new TextContainer(text));
	}

	@Override
	public int computeSegments (TextContainer container) {
		if ( currentLanguageCode == null ) {
			// Need to call selectLanguageRule()
			throw new SegmentationRuleException("No language defined for the segmenter.");
		}
		
		// Do we have codes?
		// Avoid to create an un-segmented copy if we can
		boolean hasCode;
		if ( container.contentIsOneSegment() ) hasCode = container.getSegments().getFirstContent().hasCode();
		else hasCode = container.getUnSegmentedContentCopy().hasCode();
		
		// Set the flag for trimming or not the in-line codes
		boolean isSCWS = trimCodes && !includeStartCodes;
		boolean isECWS = trimCodes && !includeEndCodes;
		boolean isICWS = trimCodes && !includeIsolatedCodes;

		// Build the list of split positions
		// Get the coded text for the whole content
		String codedText = container.getCodedText();
		List origCodePositions = storeOriginalCodePositions(codedText);	
		List codePositions = storeCodePositions(codedText);

		// Remove code markers from codedText not to get in the way of the rules
		codedText = treatIsolatedCodesAsWhitespace ?
					TextUnitUtil.removeAndReplaceCodes(codedText, ISOLATED_CODE_REPLACEMENT_TEXT) :
					TextUnitUtil.removeCodes(codedText);

		// ICU4J rules are generated for each segment, must add with the normal SRX rules
		ArrayList combinedRules = null;
		if (useIcu4JBreakRules) {
			combinedRules = new ArrayList<>();
			combinedRules.addAll(rules);
			// ICU4J break rules are always added last as we want previous "exception" SRX rules to override them if found.	
			combinedRules.addAll(getIcu4jBreakRules(codedText));
		} else {
			combinedRules = rules;
		}
		
		splits = new TreeMap<>();
		Matcher m;
		for ( CompiledRule rule : combinedRules ) {			
			m = rule.pattern.matcher(codedText);
			// FIXME: I think transparentbounds is what we want so that regex can peek 
			// behind and ahead to better give context outside the matching region. 			
			m.useTransparentBounds(true);
			int start = 0;
			int prevStart = -1;
			while (( start != prevStart ) && m.find(start) ) {
				int n = m.start()+m.group(1).length();

				// Set next start
				prevStart = start; // Comparing with previous start avoid infinite loop for non-capturing patterns
				start = n; // We search starting at each character (to make sure we cover the previous match too)
				
				// Match the end
				if ( n > codedText.length() ) continue;
				// Already a match: Per SRX algorithm, we use the first one only
				// see http://www.gala-global.org/oscarStandards/srx/srx20.html#Struct_classdefinitions
				if ( splits.containsKey(n) ) continue;
				
				// Else add a split marker
				splits.put(n, rule.isBreak);
			}
		}

		codedText = container.getCodedText(); // restore codedText after word breaks

		// Adjust split positions minding the removed original codes
		TreeMap oldSplits = splits;
		splits = new TreeMap<>();
		
		for (Integer pos : oldSplits.keySet()) {
			int newPos = recalcPos(codedText, pos, codePositions, origCodePositions);
			splits.put(newPos, oldSplits.get(pos));
		}
		
		// Set the additional split positions for mask-rules
		if ( maskRule != null ) {
			m = maskRule.matcher(codedText);
			while ( m.find() ) {
				// Remove any existing marker inside the range
				for ( int n=m.start(); n 0 ) splits.put(m.start(), true);
				splits.put(m.end(), true);
			}
		}
		
		// Adjust the split positions for in-line codes inclusion/exclusion options
		// And create the list of final splits at the same time
		finalSplits = new ArrayList();
		// Do this only if we have in-line codes
		if ( hasCode ) {
			
			// setup start, end and isolated code settings as an EnumSet to make it easier to check
			// include code options
			EnumSet includeCodeSettings = EnumSet.noneOf(TextFragment.Marker.class);
			if (includeStartCodes) {
				includeCodeSettings.add(OPENING);
			}			
			if (includeEndCodes) {
				includeCodeSettings.add(CLOSING);
			}
			if (includeIsolatedCodes) {
				includeCodeSettings.add(ISOLATED);
			}
						
			// All breaks are before codes, as we restore a code at its original pos, and if 
			// there's a break at that pos, the code will always find itself after the break 
			for (int pos : splits.keySet()) {
				if (!splits.get(pos)) continue; // Skip non-break positions
				// FIXME: Out of bounds error should never happen, but we are seeing it for Chinese
				// this fix prevents a index out of bounds exception, but may be masking a
				// bigger problem.
				if (pos >= codedText.length()) continue;

				// keep processing any consecutive code that has include = true
				// we stop when we hit a code with include = false
				Marker codeMarkerType = Marker.asEnum(codedText.charAt(pos));
				switch (codeMarkerType) {
					case OPENING:
					case CLOSING:
					case ISOLATED:
						// if include code setting = true for this code
						// Move pos forward adding any codes that also have their settings = true
						// stop when we hit a code that has include = false
						// otherwise (include code setting = false) leave the position as-is (in the following segment)
						if (includeCodeSettings.contains(codeMarkerType)) {
							do {
								pos += CODE_MARKER_LENGTH;
							}
							while (pos < codedText.length() - 1 && includeCodeSettings.contains(Marker.asEnum(codedText.charAt(pos))));
						}
						break;
					default:
						break;
				}
				// Store the updated position
				finalSplits.add(pos);
			}
		}
		else { // Just copy the real splits
			for ( int pos : splits.keySet() ) {
				if ( splits.get(pos) ) finalSplits.add(pos);
			}
		}
		
		// Now build the lists of start and end of each segment
		// but trim them of any white-spaces.
		// Deal also with including or not the in-line codes.
		starts = new ArrayList();
		ends = new ArrayList();
		int textEnd;
		int textStart = 0;
		int trimmedTextStart;
		for ( int pos : finalSplits ) {
			// FIXME: This condition should never happen, but we are seeing it for Chinese
			// this fix prevents a index out of bounds exception, but may be masking a
			// bigger problem.
			if (pos >= codedText.length()) continue;
			// Trim white-spaces and codes as required at the front
			trimmedTextStart = TextFragment.indexOfFirstNonWhitespace(codedText,
				textStart, pos-1, isSCWS, isECWS, isICWS, trimLeadingWS);
			if ( trimmedTextStart == -1 ) { //pos-1 ) {
				// Only spaces in the segment: Continue with the next position
				continue;
			}
			if ( trimLeadingWS || trimCodes ) textStart = trimmedTextStart;
			// Trim white-spaces and codes as required at the back
			if ( trimTrailingWS || trimCodes ) {
				textEnd = TextFragment.indexOfLastNonWhitespace(codedText,
					pos-1, 0, isSCWS, isECWS, isICWS, trimTrailingWS);
			}
			else textEnd = pos-1;
			if ( textEnd >= textStart ) { // Only if there is something // was > only
				if ( textEnd < pos ) textEnd++; // Adjust for +1 position
				starts.add(textStart);
				ends.add(textEnd);
			}
			textStart = pos;
		}
		// Last one
		int lastPos = codedText.length();
		if ( textStart < lastPos ) {
			// Trim white-spaces and codes as required at the front
			trimmedTextStart = TextFragment.indexOfFirstNonWhitespace(codedText, textStart,
				lastPos-1, isSCWS, isECWS, isICWS, trimLeadingWS);
			if ( trimLeadingWS || trimCodes  ) {
				if ( trimmedTextStart != -1 ) textStart = trimmedTextStart;
			}
			if (( trimmedTextStart != -1 ) && ( trimmedTextStart < lastPos )) {
				// Trim white-spaces and code as required at the back
				if ( trimTrailingWS || trimCodes ) {
					textEnd = TextFragment.indexOfLastNonWhitespace(codedText, lastPos-1,
						textStart, isSCWS, isECWS, isICWS, trimTrailingWS);
				}
				else textEnd = lastPos-1;
				if ( textEnd >= textStart ) { // Only if there is something
					if ( textEnd < lastPos ) textEnd++; // Adjust for +1 position
					starts.add(textStart);
					ends.add(textEnd);
				}
			}
		}

		// Check for single-segment text case
		if (( starts.size() == 1 ) && ( oneSegmentIncludesAll )) {
			starts.set(0, 0);
			ends.clear(); // lastPos is added just after
		}

		// Add an extra value in ends to hold the total length of the coded text
		// to avoid having to re-create it when segmenting.
		ends.add(lastPos);
		
		// Return the number of segment found
		// (ends contains one extra value, so make sure to use starts for this)
		return starts.size();
	}

	/**
	 * Converts the ICU4J break positions of a text into break rules, so they can be
	 * applied like any other SRX rule.
	 * <p>One rule is generated per boundary; each rule matches, from the start of the text,
	 * the number of characters that precede the boundary.
	 * @param text the text (without code markers) to analyze.
	 * @return the break rules generated for the given text, in boundary order.
	 */
	private Collection<CompiledRule> getIcu4jBreakRules(String text) {
		List<CompiledRule> icuRules = new ArrayList<>();
		// icu4jBreakIterator created when locale is set (setLanguage)
		icu4jBreakIterator.setText(text);
		// only needed to call generateRuleRegex method
		SRXDocument d = new SRXDocument();
		for (int boundary = icu4jBreakIterator.next(); boundary != BreakIterator.DONE; boundary = icu4jBreakIterator.next()) {
			// ICU always puts a boundary at the end of the string: skip it
			if (boundary == text.length()) {
				continue;
			}

			// Boundary is "the zero-based index of the character following the boundary"
			// (see http://userguide.icu-project.org/boundaryanalysis)
			// Moreover, if there are lots of whitespace between a sentence and the following,
			// ICU puts the boundary after all the whitespace, just before the first char of
			// the next sentence. We want the boundary before these whitespaces, and so this loop.
			while (boundary > 0 && Character.isWhitespace(text.codePointAt(boundary - 1))) {
				boundary--;
			}

			// match the number of characters that ICU4J specifies as the break position
			// starting from the beginning of the string.
			// The count is concatenated rather than formatted with %d: String.format() uses the
			// default locale and may emit non-ASCII digits, which are not valid in a regex quantifier.
			Rule r = new Rule("^(.|\\s){" + boundary + "}", "", true);
			String pattern = d.generateRuleRegex(r);
			pattern = pattern.replace(SRXDocument.ANYCODE, SRXDocument.INLINECODE_PATTERN);
			icuRules.add(new CompiledRule(pattern, true));
		}

		return icuRules;
	}

	/**
	 * Shifts a position by the total length of a number of codes.
	 * A non-isolated code counts for {@link #CODE_MARKER_LENGTH} characters. An isolated code
	 * counts for the length of the replacement text when isolated codes are treated as
	 * white-space, and for {@link #CODE_MARKER_LENGTH} characters otherwise.
	 * @param position the position to shift.
	 * @param numberOfNonIsolatedCodes the number of opening/closing codes to account for.
	 * @param numberOfIsolatedCodes the number of isolated codes to account for.
	 * @param increase true to move the position forward, false to move it backward.
	 * @return the shifted position.
	 */
	private int calculatePosition(int position, int numberOfNonIsolatedCodes, int numberOfIsolatedCodes, boolean increase) {
		int lengthPerIsolatedCode;
		if (treatIsolatedCodesAsWhitespace()) {
			lengthPerIsolatedCode = ISOLATED_CODE_REPLACEMENT_TEXT.length();
		} else {
			lengthPerIsolatedCode = CODE_MARKER_LENGTH;
		}

		int offset = numberOfNonIsolatedCodes * CODE_MARKER_LENGTH
				+ numberOfIsolatedCodes * lengthPerIsolatedCode;

		if (increase) {
			return position + offset;
		}
		return position - offset;
	}

	/**
	 * Moves a position forward by the length of the given numbers of codes.
	 * @param position the position to shift.
	 * @param numberOfNonIsolatedCodes the number of opening/closing codes to account for.
	 * @param numberOfIsolatedCodes the number of isolated codes to account for.
	 * @return the position increased by the length of the codes.
	 */
	private int calculateIncreasedPosition(int position, int numberOfNonIsolatedCodes, int numberOfIsolatedCodes) {
		return calculatePosition(position, numberOfNonIsolatedCodes, numberOfIsolatedCodes, true);
	}

	/**
	 * Moves a position backward by the length of the given numbers of codes.
	 * @param position the position to shift.
	 * @param numberOfNonIsolatedCodes the number of opening/closing codes to account for.
	 * @param numberOfIsolatedCodes the number of isolated codes to account for.
	 * @return the position decreased by the length of the codes.
	 */
	private int calculateDecreasedPosition(int position, int numberOfNonIsolatedCodes, int numberOfIsolatedCodes) {
		return calculatePosition(position, numberOfNonIsolatedCodes, numberOfIsolatedCodes, false);
	}

	/**
	 * Maps a position computed on the text without code markers back to the
	 * corresponding position in the original coded text.
	 * Every code located before the position is counted and the position is moved
	 * forward by the total length of those codes.
	 * <p>Package scope for tests.
	 * @param codedText the original coded text (with its code markers).
	 * @param pos the position in the text without code markers.
	 * @param codePositions the position of each code in the text without code markers.
	 * @param origCodePositions the position of each code in the original coded text,
	 * in the same order as <code>codePositions</code>.
	 * @return the corresponding position in the original coded text.
	 */
	int recalcPos(String codedText, int pos, List<Integer> codePositions, List<Integer> origCodePositions) {
		int numberOfNonIsolatedCodes = 0;
		int numberOfIsolatedCodes = 0;

		for (int codeIndex = 0; codeIndex < codePositions.size(); codeIndex++) {
			// Codes at or after the position do not shift it
			if (codePositions.get(codeIndex) >= pos) {
				break;
			}
			// The type of the code is read from the original text
			switch (Marker.asEnum(codedText.charAt(origCodePositions.get(codeIndex)))) {
				case OPENING:
				case CLOSING:
					numberOfNonIsolatedCodes++;
					break;
				case ISOLATED:
					numberOfIsolatedCodes++;
					break;
				default:
					// skip UNKNOWN
					break;
			}
		}

		return calculateIncreasedPosition(pos, numberOfNonIsolatedCodes, numberOfIsolatedCodes);
	}
	
	/**
	 * Maps a position in the original coded text to the corresponding position in
	 * the text without code markers.
	 * Every code located before the position is counted and the position is moved
	 * backward by the total length of those codes.
	 * @param codedText the original coded text (with its code markers).
	 * @param pos the position in the original coded text.
	 * @param origCodePositions the position of each code in the original coded text.
	 * @return the corresponding position in the text without code markers.
	 */
	int recalcPosBack(String codedText, int pos, List<Integer> origCodePositions) {
		int numberOfNonIsolatedCodes = 0;
		int numberOfIsolatedCodes = 0;

		for (Integer origCodePosition : origCodePositions) {
			// Codes at or after the position do not shift it
			if (origCodePosition >= pos) {
				break;
			}
			switch (Marker.asEnum(codedText.charAt(origCodePosition))) {
				case OPENING:
				case CLOSING:
					numberOfNonIsolatedCodes++;
					break;
				case ISOLATED:
					numberOfIsolatedCodes++;
					break;
				default:
					// skip UNKNOWN
					break;
			}
		}

		return calculateDecreasedPosition(pos, numberOfNonIsolatedCodes, numberOfIsolatedCodes);
	}

	/**
	 * Lists, for each code marker of a coded text, the position that code has once the
	 * preceding markers are removed (or replaced, for isolated codes treated as white-space).
	 * @param text the coded text to scan.
	 * @return the positions of the codes after code removal, in order of occurrence.
	 */
	List<Integer> storeCodePositions(String text) {
		List<Integer> res = new ArrayList<>();
		int numberOfNonIsolatedCodes = 0;
		int numberOfIsolatedCodes = 0;

		for (int i = 0; i < text.length(); i++) {

			switch (Marker.asEnum(text.charAt(i))) {
				case OPENING:
				case CLOSING:
					// Position of the code after code removal
					res.add(calculateDecreasedPosition(i, numberOfNonIsolatedCodes, numberOfIsolatedCodes));
					numberOfNonIsolatedCodes++;
					i++; // skip index marker
					break;
				case ISOLATED:
					// Position of the code after code removal
					res.add(calculateDecreasedPosition(i, numberOfNonIsolatedCodes, numberOfIsolatedCodes));
					numberOfIsolatedCodes++;
					i++; // skip index marker
					break;
				default:
					// skip UNKNOWN
					break;
			}
		}

		return res;
	}
	
	List storeOriginalCodePositions(String text) {
		List res = new ArrayList();
		for (int i=0; i getSplitPositions () {
		
		if ( finalSplits == null ) {
			finalSplits = new ArrayList();
		}
		return Collections.unmodifiableList(finalSplits);
	}

	@Override
	public List getRanges () {
		ArrayList list = new ArrayList();
		if ( starts == null ) return null;
		for ( int i=0; i 0 ))
			maskRule = Pattern.compile(pattern, Pattern.UNICODE_CHARACTER_CLASS);
		else
			maskRule = null;
	}
	
	/**
	 * Sets the option telling if sub-flows must be segmented.
	 * @param segmentSubFlows true to segment sub-flows, false to leave them un-segmented.
	 */
	@Override
	public void setSegmentSubFlows(boolean segmentSubFlows) {
		this.segmentSubFlows = segmentSubFlows;
	}

	/**
	 * Sets the option telling where start codes found just before a break are placed.
	 * @param includeStartCodes true to keep them in the 'left' segment,
	 * false to put them in the next segment.
	 */
	@Override
	public void setIncludeStartCodes(boolean includeStartCodes) {
		this.includeStartCodes = includeStartCodes;
	}

	/**
	 * Sets the option telling where end codes found just before a break are placed.
	 * @param includeEndCodes true to keep them in the 'left' segment,
	 * false to put them in the next segment.
	 */
	@Override
	public void setIncludeEndCodes(boolean includeEndCodes) {
		this.includeEndCodes = includeEndCodes;
	}

	/**
	 * Sets the option telling where isolated codes found just before a break are placed.
	 * @param includeIsolatedCodes true to keep them in the 'left' segment,
	 * false to put them in the next segment.
	 */
	@Override
	public void setIncludeIsolatedCodes(boolean includeIsolatedCodes) {
		this.includeIsolatedCodes = includeIsolatedCodes;
	}

	/**
	 * Sets the option telling if a segment that is alone must include everything.
	 * @param oneSegmentIncludesAll true to make a lone segment start at position 0 and
	 * end at the end of the coded text.
	 */
	@Override
	public void setOneSegmentIncludesAll(boolean oneSegmentIncludesAll) {
		this.oneSegmentIncludesAll = oneSegmentIncludesAll;
	}

	/**
	 * Sets the option telling if leading white-spaces must be trimmed from the segments.
	 * @param trimLeadingWS true to trim leading white-spaces, false to keep them.
	 */
	@Override
	public void setTrimLeadingWS(boolean trimLeadingWS) {
		this.trimLeadingWS = trimLeadingWS;
	}

	/**
	 * Sets the option telling if trailing white-spaces must be trimmed from the segments.
	 * @param trimTrailingWS true to trim trailing white-spaces, false to keep them.
	 */
	@Override
	public void setTrimTrailingWS(boolean trimTrailingWS) {
		this.trimTrailingWS = trimTrailingWS;
	}

	/**
	 * Sets the option telling if in-line codes may be trimmed at the edges of the segments.
	 * When computing segments, a type of code (start, end or isolated) is trimmed only if
	 * this option is set and the corresponding include-code option is not.
	 * @param trimCodes true to allow trimming of in-line codes, false otherwise.
	 */
	@Override
	public void setTrimCodes(boolean trimCodes) {
		this.trimCodes = trimCodes;
	}

	/**
	 * Sets the option telling how isolated code markers are handled before the rules are applied.
	 * @param treatIsolatedCodesAsWhitespace true to replace the isolated code markers by spaces,
	 * false to simply remove them.
	 */
	@Override
	public void setTreatIsolatedCodesAsWhitespace(boolean treatIsolatedCodesAsWhitespace) {
		this.treatIsolatedCodesAsWhitespace = treatIsolatedCodesAsWhitespace;
	}
}