All downloads are free. Search and download features use the official Maven repository.

org.nlpcn.commons.lang.standardization.SentencesUtil Maven / Gradle / Ivy

package org.nlpcn.commons.lang.standardization;

import java.util.ArrayList;
import java.util.List;

/**
 * Splits free text into sentences.
 *
 * <p>
 * A sentence ends at any of the following characters (the terminator is kept
 * as the last character of the sentence, then the sentence is trimmed):
 * <ul>
 * <li>tab, line feed, carriage return;</li>
 * <li>ASCII {@code ;}, {@code !}, {@code ?} and their full-width forms
 * U+FF1B, U+FF01, U+FF1F;</li>
 * <li>the ideographic full stop U+3002;</li>
 * <li>the horizontal ellipsis U+2026;</li>
 * <li>an ASCII {@code .} only when the next character has a code greater than
 * 128, so that decimals ("3.14") and dotted ASCII tokens are not split.</li>
 * </ul>
 * Whitespace in front of a sentence is skipped and empty sentences are never
 * emitted.
 *
 * <p>
 * All non-ASCII characters in this file are written as Unicode escapes so the
 * source compiles identically regardless of the file encoding in use.
 *
 * @author ansj
 */
public class SentencesUtil {

	/** Ideographic full stop. */
	private static final char IDEOGRAPHIC_FULL_STOP = '\u3002';

	/** Full-width semicolon. */
	private static final char FULLWIDTH_SEMICOLON = '\uFF1B';

	/** Full-width exclamation mark. */
	private static final char FULLWIDTH_EXCLAMATION = '\uFF01';

	/** Full-width question mark. */
	private static final char FULLWIDTH_QUESTION = '\uFF1F';

	/** Horizontal ellipsis. */
	private static final char ELLIPSIS = '\u2026';

	/**
	 * Splits a string into sentences.
	 *
	 * @param content
	 *            the text to split; may be {@code null}
	 * @return the trimmed, non-empty sentences in order of appearance; an empty
	 *         list when {@code content} is {@code null} or contains no sentence
	 */
	public List<String> toSentenceList(String content) {
		if (content == null) {
			return new ArrayList<String>();
		}
		return toSentenceList(content.toCharArray());
	}

	/**
	 * Splits a character array into sentences.
	 *
	 * @param chars
	 *            the text to split; may be {@code null}
	 * @return the trimmed, non-empty sentences in order of appearance; an empty
	 *         list when {@code chars} is {@code null} or contains no sentence
	 */
	public List<String> toSentenceList(char[] chars) {
		List<String> sentences = new ArrayList<String>();
		if (chars == null) {
			return sentences;
		}

		StringBuilder sb = new StringBuilder();

		for (int i = 0; i < chars.length; i++) {
			char c = chars[i];

			// Skip whitespace in front of a sentence. Character.isWhitespace
			// covers the ASCII space as well as the ideographic space U+3000.
			if (sb.length() == 0 && Character.isWhitespace(c)) {
				continue;
			}

			sb.append(c);
			switch (c) {
			case '.':
				// An ASCII dot only ends the sentence when a non-ASCII character
				// follows; "3.14" or "a.b" therefore stay in one piece. A dot
				// at the very end is handled by the final flush below.
				if (i < chars.length - 1 && chars[i + 1] > 128) {
					insertIntoList(sb, sentences);
					sb = new StringBuilder();
				}
				break;
			case ELLIPSIS:
				insertIntoList(sb, sentences);
				// NOTE(review): the buffer is re-seeded with the ellipsis, so
				// the following sentence also starts with it ("a...b" written
				// with U+2026 yields "a" + ellipsis and ellipsis + "b").
				// Kept as-is; confirm this is the intended behavior.
				sb = new StringBuilder();
				sb.append(ELLIPSIS);
				break;
			case '\t':
			case IDEOGRAPHIC_FULL_STOP:
			case ';':
			case FULLWIDTH_SEMICOLON:
			case '!':
			case FULLWIDTH_EXCLAMATION:
			case '?':
			case FULLWIDTH_QUESTION:
			case '\n':
			case '\r':
				insertIntoList(sb, sentences);
				sb = new StringBuilder();
				break;
			default:
				// Ordinary character: keep accumulating.
				break;
			}
		}

		// Flush whatever is left after the last terminator.
		if (sb.length() > 0) {
			insertIntoList(sb, sentences);
		}

		return sentences;
	}

	/**
	 * Appends the trimmed buffer content to {@code sentences} unless it is
	 * empty after trimming.
	 *
	 * @param sb
	 *            the buffer holding the current sentence
	 * @param sentences
	 *            the list receiving the sentence
	 */
	private void insertIntoList(StringBuilder sb, List<String> sentences) {
		String content = sb.toString().trim();
		if (content.length() > 0) {
			sentences.add(content);
		}
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy