All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.bhu.nlp.sentence.SentencesHelper Maven / Gradle / Ivy

There is a newer version: 1.0.3
Show newest version
package org.bhu.nlp.sentence;

import java.util.ArrayList;
import java.util.List;

/**
 * Sentence segmentation for mixed Chinese/Latin text.
 *
 * <p>Two modes are offered: {@link #toSentenceList(String)} keeps each
 * delimiter at the end of the sentence it terminates, while
 * {@link #toSentenceListNoChar(String)} drops the delimiters and also splits
 * on clause-level punctuation (commas, colons, quotation marks).
 *
 * <p>All non-ASCII delimiters are declared as Unicode-escape constants so the
 * class compiles to the same behavior regardless of the source file encoding.
 *
 * @author Jackie
 */
public class SentencesHelper {

	/** Horizontal ellipsis, U+2026. */
	private static final char ELLIPSIS = '\u2026';
	/** Ideographic full stop, U+3002. */
	private static final char IDEOGRAPHIC_FULL_STOP = '\u3002';
	/** Ideographic (enumeration) comma, U+3001. */
	private static final char IDEOGRAPHIC_COMMA = '\u3001';
	/** Full-width semicolon, U+FF1B. */
	private static final char FULLWIDTH_SEMICOLON = '\uFF1B';
	/** Full-width exclamation mark, U+FF01. */
	private static final char FULLWIDTH_EXCLAMATION = '\uFF01';
	/** Full-width question mark, U+FF1F. */
	private static final char FULLWIDTH_QUESTION = '\uFF1F';
	/** Full-width comma, U+FF0C. */
	private static final char FULLWIDTH_COMMA = '\uFF0C';
	/** Full-width colon, U+FF1A. */
	private static final char FULLWIDTH_COLON = '\uFF1A';
	/** Left double quotation mark, U+201C. */
	private static final char LEFT_DOUBLE_QUOTE = '\u201C';
	/** Right double quotation mark, U+201D. */
	private static final char RIGHT_DOUBLE_QUOTE = '\u201D';

	/**
	 * Splits {@code content} into sentences, keeping the terminating
	 * delimiter at the end of each sentence.
	 *
	 * @param content the text to split; may be {@code null}
	 * @return the trimmed, non-empty sentences in input order; an empty list
	 *         when {@code content} is {@code null} or has no visible text
	 */
	public static List<String> toSentenceList(String content) {
		if (content == null) {
			return new ArrayList<String>();
		}
		return toSentenceList(content.toCharArray());
	}

	/**
	 * Splits {@code content} into fragments and discards the delimiters.
	 * In addition to sentence terminators this mode splits on commas,
	 * colons and quotation marks.
	 *
	 * @param content the text to split; may be {@code null}
	 * @return the trimmed, non-empty fragments in input order; an empty list
	 *         when {@code content} is {@code null} or has no visible text
	 */
	public static List<String> toSentenceListNoChar(String content) {
		if (content == null) {
			return new ArrayList<String>();
		}
		return toSentenceListNoChar(content.toCharArray());
	}

	/**
	 * Delimiter-preserving splitter behind {@link #toSentenceList(String)}.
	 *
	 * @param chars the characters to scan
	 * @return the collected sentences
	 */
	private static List<String> toSentenceList(char[] chars) {
		List<String> sentences = new ArrayList<String>();
		StringBuilder sb = new StringBuilder();

		for (int i = 0; i < chars.length; i++) {
			char c = chars[i];
			// Ignore whitespace that would otherwise start a sentence.
			if (sb.length() == 0 && Character.isWhitespace(c)) {
				continue;
			}

			// The delimiter itself belongs to the sentence it closes.
			sb.append(c);
			switch (c) {
			case '.':
				// An ASCII period closes a sentence only when the next char
				// has a code above 128, so decimals such as 3.14 stay whole.
				if (i < chars.length - 1 && chars[i + 1] > 128) {
					insertIntoList(sb, sentences);
					sb.setLength(0);
				}
				break;
			case ELLIPSIS:
				insertIntoList(sb, sentences);
				// NOTE(review): the next sentence is deliberately seeded with
				// an ellipsis, so the mark shows up on both sides of the
				// split. Kept as-is for compatibility -- confirm intent.
				sb.setLength(0);
				sb.append(ELLIPSIS);
				break;
			case '\t':
			case IDEOGRAPHIC_FULL_STOP:
			case ';':
			case FULLWIDTH_SEMICOLON:
			case '!':
			case FULLWIDTH_EXCLAMATION:
			case '?':
			case FULLWIDTH_QUESTION:
			case IDEOGRAPHIC_COMMA:
			case ' ':
			case '\n':
			case '\r':
				insertIntoList(sb, sentences);
				sb.setLength(0);
				break;
			default:
				break;
			}
		}

		// Flush whatever follows the last delimiter.
		if (sb.length() > 0) {
			insertIntoList(sb, sentences);
		}

		return sentences;
	}

	/**
	 * Appends the trimmed buffer content to {@code sentences} unless it is
	 * empty after trimming. The buffer itself is not modified.
	 *
	 * @param sb        buffer holding the current sentence
	 * @param sentences destination list
	 */
	private static void insertIntoList(StringBuilder sb, List<String> sentences) {
		String content = sb.toString().trim();
		if (content.length() > 0) {
			sentences.add(content);
		}
	}

	/**
	 * Delimiter-dropping splitter behind
	 * {@link #toSentenceListNoChar(String)}.
	 *
	 * @param chars the characters to scan
	 * @return the collected fragments
	 */
	private static List<String> toSentenceListNoChar(char[] chars) {
		List<String> sentences = new ArrayList<String>();
		StringBuilder sb = new StringBuilder();

		for (int i = 0; i < chars.length; i++) {
			char c = chars[i];
			// Ignore whitespace that would otherwise start a fragment.
			if (sb.length() == 0 && Character.isWhitespace(c)) {
				continue;
			}

			switch (c) {
			case '.':
				// Same rule as the delimiter-preserving mode: a period is a
				// terminator only when followed by a char with code > 128.
				if (i < chars.length - 1 && chars[i + 1] > 128) {
					insertIntoList(sb, sentences);
					sb.setLength(0);
					// Skip the append below so the terminating period does
					// not leak into the start of the next fragment.
					continue;
				}
				break;
			case ELLIPSIS:
			case '\t':
			case ',':
			case FULLWIDTH_COMMA:
			case IDEOGRAPHIC_FULL_STOP:
			case ';':
			case FULLWIDTH_SEMICOLON:
			case '"':
			case IDEOGRAPHIC_COMMA:
			case ':':
			case FULLWIDTH_COLON:
			case LEFT_DOUBLE_QUOTE:
			case RIGHT_DOUBLE_QUOTE:
			case '!':
			case FULLWIDTH_EXCLAMATION:
			case '?':
			case FULLWIDTH_QUESTION:
			case ' ':
			case '\n':
			case '\r':
				insertIntoList(sb, sentences);
				sb.setLength(0);
				// Delimiters are never copied into the output in this mode.
				continue;
			default:
				break;
			}
			sb.append(c);
		}

		// Flush whatever follows the last delimiter.
		if (sb.length() > 0) {
			insertIntoList(sb, sentences);
		}

		return sentences;
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy