org.nlpcn.commons.lang.standardization.SentencesUtil Maven / Gradle / Ivy
package org.nlpcn.commons.lang.standardization;
import java.util.ArrayList;
import java.util.List;
/**
 * Splits text into sentences.
 *
 * <p>A sentence ends at a line break, a tab, or a sentence-final punctuation
 * mark (ASCII or CJK/fullwidth). The terminating punctuation is kept as part
 * of the sentence; surrounding whitespace is trimmed and empty pieces are
 * dropped.
 *
 * @author ansj
 */
public class SentencesUtil {

    /** Horizontal ellipsis (U+2026). */
    private static final char HORIZONTAL_ELLIPSIS = '\u2026';
    /** Ideographic full stop (U+3002). */
    private static final char IDEOGRAPHIC_FULL_STOP = '\u3002';
    /** Fullwidth semicolon (U+FF1B). */
    private static final char FULLWIDTH_SEMICOLON = '\uFF1B';
    /** Fullwidth exclamation mark (U+FF01). */
    private static final char FULLWIDTH_EXCLAMATION_MARK = '\uFF01';
    /** Fullwidth question mark (U+FF1F). */
    private static final char FULLWIDTH_QUESTION_MARK = '\uFF1F';

    /**
     * Splits a string into sentences.
     *
     * @param content the text to split; {@code null} is treated as empty text
     * @return the trimmed, non-empty sentences in order of appearance
     */
    public List<String> toSentenceList(String content) {
        if (content == null) {
            return new ArrayList<String>();
        }
        return toSentenceList(content.toCharArray());
    }

    /**
     * Splits a character array into sentences.
     *
     * <p>Rules, applied character by character:
     * <ul>
     * <li>whitespace is skipped while the current sentence is still empty;</li>
     * <li>an ASCII full stop ends the sentence only when the next character
     * has a value above 128, so {@code "3.14"} and {@code "a.b"} are not
     * split;</li>
     * <li>a horizontal ellipsis ends the sentence and also becomes the first
     * character of the next one;</li>
     * <li>tab, line feed, carriage return, the ideographic full stop, and the
     * ASCII or fullwidth forms of {@code ;}, {@code !} and {@code ?} always
     * end the sentence.</li>
     * </ul>
     *
     * @param chars the text to split; {@code null} is treated as empty text
     * @return the trimmed, non-empty sentences in order of appearance
     */
    public List<String> toSentenceList(char[] chars) {
        List<String> sentences = new ArrayList<String>();
        if (chars == null) {
            return sentences;
        }

        StringBuilder current = new StringBuilder();
        for (int i = 0; i < chars.length; i++) {
            char c = chars[i];

            // Never start a sentence with whitespace.
            if (current.length() == 0 && Character.isWhitespace(c)) {
                continue;
            }
            current.append(c);

            switch (c) {
            case '.':
                // Only a boundary when followed by a non-ASCII character, so
                // decimals, abbreviations and dotted names stay in one piece.
                if (i < chars.length - 1 && chars[i + 1] > 128) {
                    insertIntoList(current, sentences);
                    current.setLength(0);
                }
                break;
            case HORIZONTAL_ELLIPSIS:
                insertIntoList(current, sentences);
                // NOTE(review): the next sentence is deliberately seeded with
                // an ellipsis, so "a<ellipsis>b" yields "a<ellipsis>" and
                // "<ellipsis>b", and a trailing ellipsis yields an extra
                // lone-ellipsis entry. Kept as in the original; confirm this
                // is the intended output.
                current.setLength(0);
                current.append(HORIZONTAL_ELLIPSIS);
                break;
            case '\t':
            case '\n':
            case '\r':
            case IDEOGRAPHIC_FULL_STOP:
            case ';':
            case FULLWIDTH_SEMICOLON:
            case '!':
            case FULLWIDTH_EXCLAMATION_MARK:
            case '?':
            case FULLWIDTH_QUESTION_MARK:
                insertIntoList(current, sentences);
                current.setLength(0);
                break;
            default:
                break;
            }
        }

        // Flush whatever follows the last terminator.
        if (current.length() > 0) {
            insertIntoList(current, sentences);
        }
        return sentences;
    }

    /**
     * Appends the trimmed buffer content to {@code sentences} unless it is empty.
     *
     * @param sb        buffer holding the sentence collected so far (not modified)
     * @param sentences list receiving the sentence
     */
    private static void insertIntoList(StringBuilder sb, List<String> sentences) {
        String content = sb.toString().trim();
        if (content.length() > 0) {
            sentences.add(content);
        }
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy