lv.semti.morphology.analyzer.Splitting Maven / Gradle / Ivy
                 Go to download
                
        
                    Show more of this group  Show more artifacts with this name
Show all versions of morphology Show documentation
                Show all versions of morphology Show documentation
Latvian morphological analysis library
                
             The newest version!
        
        /*******************************************************************************
 * Copyright 2008, 2009, 2014 Institute of Mathematics and Computer Science, University of Latvia
 * Author: Pēteris Paikens, Imants Borodkins
 * 
 *     This program is free software: you can redistribute it and/or modify
 *     it under the terms of the GNU General Public License as published by
 *     the Free Software Foundation, either version 3 of the License, or
 *     (at your option) any later version.
 * 
 *     This program is distributed in the hope that it will be useful,
 *     but WITHOUT ANY WARRANTY; without even the implied warranty of
 *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *     GNU General Public License for more details.
 * 
 *     You should have received a copy of the GNU General Public License
 *     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *******************************************************************************/
package lv.semti.morphology.analyzer;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.Collectors;
import lv.semti.morphology.attributes.AttributeNames;
import org.apache.commons.lang3.StringUtils;
/**
 * Tools for detecting chunk and token bounds.
 * When you edit this, please, update tokenize.pl in Chunker, too!
 */
public class Splitting {
	// Tokenizer states; the original note lists them as: in a word, in a delimiter, in whitespace.
	// Only IN_WORD and IN_SPACE remain: according to the original note, the former
	// IN_DELIMITER and IN_EXCEPTION functionality is now done by the Trie in morphoAnalyzer.automats.
	private enum Status {IN_WORD, IN_SPACE};
	
	/**
	 * Sentence length cap that tokenizeSentences(Analyzer, String) passes on as
	 * the lengthCap argument, where it is compared against the number of tokens in a sentence.
	 * NOTE(review): the field is not final, so it can be reassigned at runtime - confirm whether that is intended.
	 */
	public static int DEFAULT_SENTENCE_LENGTH_CAP = 250;
	
	/**
	 * Tells whether the given word closes a chunk, i.e. ends a sentence
	 * (like a period or an exclamation mark).
	 *
	 * @param word the token to inspect
	 * @return true if the word carries the punctuation-type attribute with the "period" value
	 */
	public static boolean isChunkCloser(Word word) {
		// Per the original note: it is assumed that only the 'zs' tag marks a sentence end -
		// it covers the period, question mark, exclamation mark, ellipsis and their combinations/variations.
		final boolean endsSentence =
				word.hasAttribute(AttributeNames.i_PieturziimesTips, AttributeNames.v_Punkts);
		return endsSentence;
	}
		
	/**
	 * Tells whether the given character is one of the fixed separator characters:
	 * whitespace-like characters, ASCII punctuation and brackets, and a set of
	 * typographic dashes, quotation marks, primes, dots and similar symbols.
	 *
	 * @param c the character to test
	 * @return true if the character occurs in the separator list
	 */
	public static boolean isSeparator(char c)
	{
		final String separatorChars = " \t\n\r\u00A0\u2029\u200B.?:/!,;\"'`´(){}<>«»-+[]—‐‑‒–―‘’‚‛“”„‟′″‴‵‶‷‹›‼‽⁈⁉․‥…&•*";
		// A single char is present in the list exactly when its index is non-negative
		return separatorChars.indexOf(c) >= 0;
	}
	/**
	 * Tells whether the given character is treated as whitespace by the tokenizer.
	 * Besides everything Java regards as whitespace (space, tab, newline, ...), this
	 * also accepts ISO control characters, the no-break space (U+00A0), U+FEFF,
	 * the paragraph separator (U+2029) and the zero-width space (U+200B).
	 *
	 * @param c the character to test
	 * @return true if the character counts as whitespace
	 */
	public static boolean isSpace(char c)
	{
		if (Character.isWhitespace(c) || Character.isISOControl(c)) {
			return true;
		}
		switch (c) {
			case '\u00A0':
			case '\uFEFF':
			case '\u2029':
			case '\u200B':
				return true;
			default:
				return false;
		}
	}
	/**
	 * Builds the token for str[start, end) and annotates each of its wordforms with
	 * the whitespace that preceded it and with its offset.
	 *
	 * @param morphoAnalyzer analyzer used to analyze the token text; when null, a plain Word is created instead
	 * @param str the text being tokenized
	 * @param start index of the first character of the token in str
	 * @param end index just past the last character of the token in str
	 * @param accumulatedWhitespace whitespace collected before this token
	 * @return the token, with the whitespace-before and offset attributes set on every wordform
	 */
	private static Word formToken(Analyzer morphoAnalyzer, String str, int start, int end, StringBuilder accumulatedWhitespace) {
		// Soft hyphens are removed from the token text before analysis
		final String surface = str.substring(start, end).replace("\u00AD", "");

		final Word token;
		if (morphoAnalyzer != null) {
			token = morphoAnalyzer.analyze(surface);
		} else {
			token = new Word(surface);
		}

		// Zero-width spaces are used as temporary separators, so they are neither
		// reported as whitespace nor counted towards the offset
		final String whitespaceBefore = accumulatedWhitespace.toString().replace("\u200B", "");
		final int markersBefore = StringUtils.countMatches(str.substring(0,start), '\u200B');
		final String offsetText = Integer.toString(start - markersBefore);

		for (Wordform form : token.wordforms) {
			form.addAttribute(AttributeNames.i_WhitespaceBefore, whitespaceBefore);
			form.addAttribute(AttributeNames.i_Offset, offsetText);
		}
		return token;
	}
	/**
	 * Tokenizes the string (sentence?) and runs morphoanalysis on each word.
	 * <p>
	 * Token boundaries are found by walking a copy of the analyzer's trie
	 * (morphoAnalyzer.automats) over the text. Every emitted token is built by
	 * formToken, which records the preceding whitespace and the offset on its wordforms.
	 * Tokens whose text is empty are dropped from the result.
	 *
	 * @param morphoAnalyzer analyzer providing the trie and the analysis; it is dereferenced
	 *                       for any non-null chunk, so it must not be null in that case
	 * @param chunk the text to tokenize; null yields an empty list
	 * @return the tokens in text order
	 */
	public static LinkedList<Word> tokenize(Analyzer morphoAnalyzer, String chunk) {
		LinkedList<Word> tokens = new LinkedList<>();
		if (chunk == null) return tokens;
		
		Trie automats = new Trie(morphoAnalyzer.automats);
		// bug fix - whitespace is appended at the end
		String str = chunk+" ";
		// workaround for doubled apostrophes: a zero-width space is inserted in front of them
		str = str.replaceAll("''", "\u200B''"); // FIXME this slightly messes up accumulatedWhitespace and the offsets
		// workaround for sentence-final abbreviations etc.: the final period is split off with a zero-width space
		str = str.replaceAll("([\\p{L}\\d])\\.(\\p{Z})*$", "$1\u200B.$2"); // FIXME this slightly messes up accumulatedWhitespace and the offsets
		// here every word of the text is handed, in a loop, to the morphological analysis
		int progress = 0;                 // start index of the token currently being read
		boolean inApostrophes=false;
		Status statuss = Status.IN_SPACE;
		StringBuilder accumulatedWhitespace = new StringBuilder();
		int lastGoodEnd=0;                // last position at which the current token could legally end
		boolean canEndInNextStep=false;
		
		for (int i = 0; i < str.length(); i++) {
			switch (statuss) {
			case IN_SPACE:
				if (!Splitting.isSpace(str.charAt(i))) {
					if (str.charAt(i)=='\'') inApostrophes=true;
					automats.reset(); // restore the automaton state
					automats.findNextBranch(str.charAt(i)); // find the first valid branch
					
					if(automats.status()>0) { // check whether the automaton found the symbol
						// found
						statuss=Status.IN_WORD;
						progress=i;
						// check whether the string may also end with this symbol
						canEndInNextStep = (automats.status()==2);
					} else {
						// not found: emit this single symbol as a token and try again
						tokens.add( formToken(morphoAnalyzer, str, i, i+1, accumulatedWhitespace));
						// the whitespace has been attributed to this token; without the reset
						// the following token would report the same whitespace once more
						accumulatedWhitespace = new StringBuilder();
					}
				} else {
					accumulatedWhitespace.append(str.charAt(i));
				}
				break;
			case IN_WORD:
				// check whether a potential end has been found
				if (canEndInNextStep &&
						(Splitting.isSeparator(str.charAt(i)) || !Character.isLetter((i>0 ? str.charAt(i-1) : 0) ) ) )
				{
					lastGoodEnd=i;
					if(str.charAt(i)=='\'' && inApostrophes) {
						// closing apostrophe: emit the word, then the apostrophe itself with no whitespace before it
						tokens.add( formToken(morphoAnalyzer, str, progress, i, accumulatedWhitespace));
						accumulatedWhitespace = new StringBuilder();
						tokens.add( formToken(morphoAnalyzer, str, i, i+1, accumulatedWhitespace));
						inApostrophes=false;
						statuss=Status.IN_SPACE;
						break;
					}
				}
				canEndInNextStep=false;
				
				// try to find the next symbol in the automaton
				if (automats.findNext(str.charAt(i))>0) { // found
					// check whether the string may also end with this symbol
					if (automats.status()==2)
						canEndInNextStep=true;
				} else {
					// not found: check whether a potential end position was seen while the automaton was running
					if (lastGoodEnd>progress) {
						tokens.add( formToken(morphoAnalyzer, str, progress, lastGoodEnd, accumulatedWhitespace));
						i=lastGoodEnd-1; // the loop increment moves on to lastGoodEnd
						statuss = Status.IN_SPACE;
						accumulatedWhitespace = new StringBuilder();
					} else {
						i=progress; // rewind to the token start
						// try to find the next valid branch
						automats.nextBranch();
						automats.findNextBranch(str.charAt(i));
						if(automats.status()>0) { // check whether the symbol was found
							// check whether the string may also end with this symbol
							if(automats.status()==2)
								canEndInNextStep=true;
						} else {
							// not found: add the symbol to the result and then go looking for a new start
							// this is generally fishy. FIXME
							tokens.add( formToken(morphoAnalyzer, str, i,i+1, accumulatedWhitespace));
							statuss = Status.IN_SPACE;
							accumulatedWhitespace = new StringBuilder();
						}
					}
				}				
				break;
			}
		} // for i..
		
		
		if (statuss == Status.IN_WORD) {
			tokens.add( formToken(morphoAnalyzer, str, progress, str.length(), accumulatedWhitespace));
		}
		// drop tokens whose text is empty
		tokens.removeIf(token -> token.getToken().length() == 0);
		return tokens;
	}
	
	
	/**
	 * Tokenizes some text (usually a sentence).
	 * <p>
	 * With bruteSplit the trimmed text is simply split on single space characters and
	 * every non-empty part becomes one token; punctuation is not separated.
	 * Otherwise the call is delegated to {@code tokenize(morphoAnalyzer, chunk)}.
	 *
	 * @param morphoAnalyzer analyzer used to analyze each token; in brute-split mode a null
	 *                       analyzer yields plain Word objects instead
	 * @param chunk the text to tokenize; null yields an empty list
	 * @param bruteSplit true to split on spaces only, false to use the regular tokenizer
	 * @return the tokens in text order
	 */
	public static LinkedList<Word> tokenize(Analyzer morphoAnalyzer, String chunk, boolean bruteSplit) {
		if (!bruteSplit) {
			return tokenize(morphoAnalyzer, chunk);
		}

		LinkedList<Word> tokens = new LinkedList<>();
		if (chunk == null) return tokens;

		for (String part : chunk.trim().split(" ")) {
			if (part.isEmpty()) {
				continue; // consecutive spaces produce empty parts
			}
			tokens.add( (morphoAnalyzer == null) ?
					new Word(part) :
					morphoAnalyzer.analyze(part));
		}
		return tokens;
	}
	public static LinkedList> tokenizeSentences(
			Analyzer morphoAnalyzer, String paragraph) {
		return tokenizeSentences(morphoAnalyzer, paragraph, DEFAULT_SENTENCE_LENGTH_CAP);
	}
	/***
	 * Tokenizes a paragraph, and splits it into sentences.
	 * @param morphoAnalyzer
	 * @param paragraph
	 * @return
	 */
	public static LinkedList> tokenizeSentences(
		 	Analyzer morphoAnalyzer, String paragraph, int lengthCap) {
		LinkedList> result = new LinkedList>();
		
		List tokens = Splitting.tokenize(morphoAnalyzer, paragraph);
		LinkedList sentence = new LinkedList();
		for (Word word : tokens) {
		    // Teikumu beigas iekšā tiešajā runā - pievelkam pēdiņu klāt
            if (sentence.size() == 0) {
                if ((word.isRecognized() && word.getBestWordform().getTag().equalsIgnoreCase("zq")) || word.getToken().equals(")")) {
                    // Pārbaudam vai iepriekšējā teikuma beigās ir punkts
                    if (!result.isEmpty() && !result.getLast().isEmpty()) {
                        String prevtoken = result.getLast().getLast().getToken();
                        if (prevtoken.equals(".") || prevtoken.equals("!") || prevtoken.equals("?") || prevtoken.equals("\"")) {
                            result.getLast().add(word);
                            continue;
                        }
                    }
                }
            }
			sentence.add(word);
			if ( Splitting.isChunkCloser(word) || // does this token look like end of sentence
				(sentence.size() >= lengthCap-5 && (word.hasAttribute(AttributeNames.i_PartOfSpeech, AttributeNames.v_Punctuation) || word.getToken().startsWith("<")) )
				|| sentence.size() > lengthCap) { 		// hard limit		
				result.add(sentence);
				sentence = new LinkedList();
			}
		}
		
		if (!sentence.isEmpty()) 
			result.add(sentence);
		return result;
	}
}
                  © 2015 - 2025 Weber Informatics LLC | Privacy Policy