All downloads are free. Search and download functionality uses the official Maven repository.

smile.nlp.tokenizer.SimpleParagraphSplitter Maven / Gradle / Ivy

/*******************************************************************************
 * Copyright (c) 2010 Haifeng Li
 *   
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *  
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/

package smile.nlp.tokenizer;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A simple paragraph splitter. Given a string, it returns an array of
 * strings, where each element is a paragraph.
 * <p>
 * The beginning of a paragraph is indicated by
 * <ul>
 * <li>the beginning of the content, that is, the paragraph is the first
 * content in the document, or</li>
 * <li>one or more blank lines preceding the paragraph text.</li>
 * </ul>
 * The end of a paragraph is indicated by
 * <ul>
 * <li>the end of the content, that is, the paragraph is the last content in
 * the document, or</li>
 * <li>one or more blank lines following the paragraph text.</li>
 * </ul>
 * A blank line contains zero or more non-printing characters, such as
 * space or tab, followed by a new line. A run of one or more Unicode
 * paragraph-separator characters (U+2029) also ends a paragraph.
 *
 * @author Haifeng Li
 */
public class SimpleParagraphSplitter implements ParagraphSplitter {
    /**
     * Matches the whitespace that makes up a whitespace-only line so it can
     * be removed. The embedded {@code (?m)} flag turns on multiline mode so
     * that {@code ^} and {@code $} anchor at line boundaries rather than only
     * at the start and end of the whole input.
     */
    private static final Pattern REGEX_BLANK_LINE = Pattern.compile("(?m)^\\s+$");

    /**
     * Pattern used to split paragraphs: either two or more consecutive line
     * breaks (LF, LF+CR or CR+LF; matched possessively), or a run of one or
     * more paragraph-separator characters (U+2029).
     */
    private static final Pattern REGEX_PARAGRAPH =
            Pattern.compile("(\\n|(\\n\\r)|(\\r\\n)){2,}+|\u2029+");

    /**
     * The singleton instance of the paragraph splitter.
     */
    private static final SimpleParagraphSplitter singleton = new SimpleParagraphSplitter();

    /**
     * Private constructor; obtain the shared instance via {@link #getInstance()}.
     */
    private SimpleParagraphSplitter() {
    }

    /**
     * Returns the singleton instance.
     *
     * @return the shared {@code SimpleParagraphSplitter} instance.
     */
    public static SimpleParagraphSplitter getInstance() {
        return singleton;
    }

    /**
     * Splits the text into paragraphs.
     *
     * @param text the text to split.
     * @return the paragraphs, in the order they appear in {@code text}.
     */
    @Override
    public String[] split(String text) {
        // Strip the whitespace from whitespace-only lines first, so that lines
        // containing only spaces or tabs count as blank-line separators below.
        Matcher matcher = REGEX_BLANK_LINE.matcher(text);
        String normalized = matcher.replaceAll("");

        return REGEX_PARAGRAPH.split(normalized);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy