/*******************************************************************************
* Copyright 2008, 2009, 2014 Institute of Mathematics and Computer Science, University of Latvia
* Author: Pēteris Paikens, Imants Borodkins
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*******************************************************************************/
package lv.semti.morphology.analyzer;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.Collectors;
import lv.semti.morphology.attributes.AttributeNames;
import org.apache.commons.lang3.StringUtils;
/**
* Tools for detecting chunk and token bounds.
* When you edit this, please update tokenize.pl in Chunker as well!
*/
public class Splitting {
// In a word, in a delimiter, in whitespace
private enum Status {IN_WORD, IN_SPACE}; // IN_DELIMITER, IN_EXCEPTION removed - that functionality is now handled by the Trie morphoAnalyzer.automats
public static int DEFAULT_SENTENCE_LENGTH_CAP = 250;
/**
* Determines whether the given word should close a chunk (i.e., it ends a sentence, like a period or exclamation mark).
*/
public static boolean isChunkCloser(Word word) {
return word.hasAttribute(AttributeNames.i_PieturziimesTips, AttributeNames.v_Punkts); // we assume that only the 'zs' tag marks sentence ends - it covers periods, question marks, exclamation marks, ellipses and their combinations/variations.
}
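/* Illustrative sketch (not part of the original source): isChunkCloser can be used
 * to count sentence-final tokens in an already tokenized list; `tokens` here is
 * assumed to come from Splitting.tokenize(...).
 *
 *     long sentenceEnds = tokens.stream().filter(Splitting::isChunkCloser).count();
 */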
public static boolean isSeparator(char c)
{
String separators=" \t\n\r\u00A0\u2029\u200B.?:/!,;\"'`´(){}<>«»-+[]—‐‑‒–―‘’‚‛“”„‟′″‴‵‶‷‹›‼‽⁈⁉․‥…&•*";
return separators.contains(String.valueOf(c));
}
/**
* Determines whether the given char counts as whitespace (regular whitespace, control characters, non-breaking space, BOM, paragraph separator, zero-width space).
*/
public static boolean isSpace(char c)
{
return Character.isWhitespace(c) || Character.isISOControl(c) || c == '\u00A0' || c == '\uFEFF' || c == '\u2029' || c == '\u200B';
}
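/* Illustrative sketch (not part of the original source) of how the two character
 * classifiers behave; the expected results follow from the character sets above.
 *
 *     Splitting.isSeparator('.');  // true - sentence punctuation
 *     Splitting.isSeparator('«');  // true - quotation marks count as separators
 *     Splitting.isSeparator('a');  // false
 *     Splitting.isSpace('\u00A0'); // true - non-breaking space
 *     Splitting.isSpace('-');      // false - a hyphen is a separator, not whitespace
 */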
private static Word formToken(Analyzer morphoAnalyzer, String str, int start, int end, StringBuilder accumulatedWhitespace) {
String word = str.substring(start, end);
word = word.replace("\u00AD", ""); // Soft hyphen gets removed from word before analysis
Word token = (morphoAnalyzer == null) ? new Word(word) : morphoAnalyzer.analyze(word);
String whitespace = accumulatedWhitespace.toString().replace("\u200B", ""); // zero-width spaces are used as temporary separators
int offset = start - StringUtils.countMatches(str.substring(0,start), '\u200B');
for (Wordform wf : token.wordforms) {
wf.addAttribute(AttributeNames.i_WhitespaceBefore, whitespace);
wf.addAttribute(AttributeNames.i_Offset, Integer.toString(offset));
}
return token;
}
/**
* Tokenizes the string (typically a sentence) and runs morphological analysis on each word.
*/
public static LinkedList<Word> tokenize(Analyzer morphoAnalyzer, String chunk) {
LinkedList<Word> tokens = new LinkedList<>();
if (chunk == null) return tokens;
Trie automats = new Trie(morphoAnalyzer.automats);
// bug fix - append a trailing whitespace at the end
String str = chunk+" ";
// workaround for the double-apostrophe abomination
str = str.replaceAll("''", "\u200B''"); // FIXME this slightly messes up accumulatedWhitespace and the offsets
// workaround for abbreviations etc. at the end of a sentence
str = str.replaceAll("([\\p{L}\\d])\\.(\\p{Z})*$", "$1\u200B.$2"); // FIXME this slightly messes up accumulatedWhitespace and the offsets
// str = str.replaceAll("([\\d])\\.(\\p{Z})*$", "$1\u200B.$2"); // FIXME this slightly messes up accumulatedWhitespace and the offsets
// here every word in the text is passed to morphological analysis in a loop.
int progress = 0;
boolean inApostrophes=false;
Status statuss = Status.IN_SPACE;
StringBuilder accumulatedWhitespace = new StringBuilder();
int lastGoodEnd=0;
boolean canEndInNextStep=false;
for (int i = 0; i < str.length(); i++) {
switch (statuss) {
case IN_SPACE:
if (!Splitting.isSpace(str.charAt(i))) {
if (str.charAt(i)=='\'') inApostrophes=true;
automats.reset(); // reset the automaton state
automats.findNextBranch(str.charAt(i)); // find the first valid branch
if(automats.status()>0) { // check whether the automaton found the symbol
// if it did
statuss=Status.IN_WORD;
progress=i;
// check whether the string could also end here
canEndInNextStep = (automats.status()==2);
} else {
// if not, emit a single-character token and try again
tokens.add( formToken(morphoAnalyzer, str, i, i+1, accumulatedWhitespace));
}
} else {
accumulatedWhitespace.append(str.charAt(i));
}
break;
case IN_WORD:
// check whether a potential token end has been found
if (canEndInNextStep &&
(Splitting.isSeparator(str.charAt(i)) || !Character.isLetter((i>0 ? str.charAt(i-1) : 0) ) ) )
{
lastGoodEnd=i;
if(str.charAt(i)=='\'' && inApostrophes) {
tokens.add( formToken(morphoAnalyzer, str, progress, i, accumulatedWhitespace));
accumulatedWhitespace = new StringBuilder();
tokens.add( formToken(morphoAnalyzer, str, i, i+1, accumulatedWhitespace));
inApostrophes=false;
statuss=Status.IN_SPACE;
break;
}
}
canEndInNextStep=false;
// try to find the next symbol in the automaton
if (automats.findNext(str.charAt(i))>0) { // if found
// check whether the string could also end here
if (automats.status()==2)
canEndInNextStep=true;
} else {
// if not found, check whether a potential end position was found while the automaton was running
if (lastGoodEnd>progress) {
tokens.add( formToken(morphoAnalyzer, str, progress, lastGoodEnd, accumulatedWhitespace));
i=lastGoodEnd-1;
statuss = Status.IN_SPACE;
accumulatedWhitespace = new StringBuilder();
} else {
i=progress;
// try to find the next valid branch
automats.nextBranch();
automats.findNextBranch(str.charAt(i));
if(automats.status()>0) { // check whether the symbol was found
// check whether the string could also end here
if(automats.status()==2)
canEndInNextStep=true;
} else {
// if not found, add the character to the result and then go looking for a new start
// this is generally fishy. FIXME
tokens.add( formToken(morphoAnalyzer, str, i,i+1, accumulatedWhitespace));
statuss = Status.IN_SPACE;
accumulatedWhitespace = new StringBuilder();
}
}
}
break;
}
} // for i..
if (statuss == Status.IN_WORD) {
tokens.add( formToken(morphoAnalyzer, str, progress, str.length(), accumulatedWhitespace));
}
tokens = new LinkedList<>(tokens.stream().filter(s -> s.getToken().length() > 0).collect(Collectors.toList()));
return tokens;
}
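/* Illustrative usage sketch (not part of the original source). How the Analyzer
 * instance is obtained is an assumption here - this overload needs a non-null
 * analyzer, since the token automaton is taken from morphoAnalyzer.automats.
 * Each returned token's wordforms carry the i_WhitespaceBefore and i_Offset
 * attributes set in formToken above.
 *
 *     Analyzer analyzer = ...; // assumed to be available from your setup
 *     LinkedList<Word> tokens = Splitting.tokenize(analyzer, "Sveika, pasaule!");
 *     for (Word w : tokens) {
 *         System.out.println(w.getToken());
 *     }
 */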
/***
* Tokenizes some text (usually a sentence).
* @param morphoAnalyzer analyzer used to analyze each token; may be null when bruteSplit is true, in which case unanalyzed Word objects are returned
* @param chunk the text to tokenize
* @param bruteSplit if true, split on plain spaces only; otherwise use the full automaton-based tokenizer
* @return list of tokens
*/
public static LinkedList<Word> tokenize(Analyzer morphoAnalyzer, String chunk, boolean bruteSplit) {
if(bruteSplit)
{
LinkedList<Word> tokens = new LinkedList<>();
if (chunk == null) return tokens;
String[] parts_of_string = chunk.trim().split(" ");
for(String part : parts_of_string)
{
if (part.length()>0)
tokens.add( (morphoAnalyzer == null) ?
new Word(part) :
morphoAnalyzer.analyze(part));
}
return tokens;
}
else
{
return tokenize(morphoAnalyzer, chunk);
}
}
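/* Illustrative sketch (not part of the original source): with bruteSplit=true the
 * text is split on plain spaces only, and a null analyzer is tolerated - plain
 * Word objects are returned instead of analyzed ones.
 *
 *     LinkedList<Word> raw = Splitting.tokenize(null, "viens divi trīs", true);
 *     // raw.size() == 3; raw.get(0).getToken() is "viens"
 */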
public static LinkedList<LinkedList<Word>> tokenizeSentences(
Analyzer morphoAnalyzer, String paragraph) {
return tokenizeSentences(morphoAnalyzer, paragraph, DEFAULT_SENTENCE_LENGTH_CAP);
}
/***
* Tokenizes a paragraph and splits it into sentences.
* @param morphoAnalyzer analyzer used to analyze each token
* @param paragraph the text to tokenize
* @param lengthCap maximum sentence length (in tokens) before a forced split
* @return list of sentences, each sentence being a list of tokens
*/
public static LinkedList<LinkedList<Word>> tokenizeSentences(
Analyzer morphoAnalyzer, String paragraph, int lengthCap) {
LinkedList<LinkedList<Word>> result = new LinkedList<>();
List<Word> tokens = Splitting.tokenize(morphoAnalyzer, paragraph);
LinkedList<Word> sentence = new LinkedList<>();
for (Word word : tokens) {
// End of a sentence inside direct speech - attach the closing quotation mark to the previous sentence
if (sentence.size() == 0) {
if ((word.isRecognized() && word.getBestWordform().getTag().equalsIgnoreCase("zq")) || word.getToken().equals(")")) {
// Check whether the previous sentence ends in a full stop
if (!result.isEmpty() && !result.getLast().isEmpty()) {
String prevtoken = result.getLast().getLast().getToken();
if (prevtoken.equals(".") || prevtoken.equals("!") || prevtoken.equals("?") || prevtoken.equals("\"")) {
result.getLast().add(word);
continue;
}
}
}
}
sentence.add(word);
if ( Splitting.isChunkCloser(word) || // does this token look like end of sentence
(sentence.size() >= lengthCap-5 && (word.hasAttribute(AttributeNames.i_PartOfSpeech, AttributeNames.v_Punctuation) || word.getToken().startsWith("<")) )
|| sentence.size() > lengthCap) { // hard limit
result.add(sentence);
sentence = new LinkedList<>();
}
}
if (!sentence.isEmpty())
result.add(sentence);
return result;
}
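/* Illustrative usage sketch (not part of the original source); the Analyzer
 * instance is assumed to be available, and the sentence count shown is the
 * expected outcome when the analyzer tags the full stops as sentence-ending.
 *
 *     LinkedList<LinkedList<Word>> sentences =
 *             Splitting.tokenizeSentences(analyzer, "Viens teikums. Otrs teikums!");
 *     // expected: sentences.size() == 2, each inner list holding one sentence's tokens
 */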
}