com.norconex.commons.lang.io.TextReader Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of norconex-commons-lang Show documentation
Show all versions of norconex-commons-lang Show documentation
Norconex Commons Lang is a Java library containing utility classes that complement the Java API and are not found in commonly available libraries (such as the great Apache Commons Lang, which it relies on).
/* Copyright 2015-2016 Norconex Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.norconex.commons.lang.io;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.FileUtils;
/**
 * Reads text from an input stream, splitting it wisely whenever the text
 * is too large. First tries to split after the last paragraph. If there
 * are no paragraphs, it tries to split after the last sentence. If no sentence
 * can be detected, it splits on the last word. If no words are found,
 * it returns all it could read up to the maximum read size.
 * @author Pascal Essiembre
 * @since 1.6.0
 */
public class TextReader extends Reader {

    /** Default maximum number of characters returned by one read. */
    public static final int DEFAULT_MAX_READ_SIZE =
            (int) (FileUtils.ONE_KB * 64);

    private static final int PATTERN_FLAGS =
            Pattern.DOTALL | Pattern.UNICODE_CHARACTER_CLASS;

    // In each pattern the leading greedy ".*" pushes the match to the LAST
    // delimiter found in the buffer, and group 1 captures that delimiter.
    private static final Pattern PARAGRAPH_PATTERN = Pattern.compile(
            "^.*(\\p{javaWhitespace}*[\\n\\r]\\p{javaWhitespace}*?"
          + "[\\n\\r]\\p{javaWhitespace}*)", PATTERN_FLAGS);
    private static final Pattern SENTENCE_PATTERN = Pattern.compile(
            "^.*[\\.\\?\\!](\\p{javaWhitespace}+|$)", PATTERN_FLAGS);
    private static final Pattern WORD_PATTERN = Pattern.compile(
            "^.*(\\p{javaWhitespace}+)", PATTERN_FLAGS);

    private final BufferedReader reader;
    private final int maxReadSize;
    private final boolean removeTrailingDelimiter;

    /** Text already read from the reader but not yet returned. */
    private final StringBuilder buffer = new StringBuilder();

    /**
     * Creates a new text reader, reading up to 64K characters at a time
     * when {@link #readText()} is called.
     * @param reader a Reader
     */
    public TextReader(Reader reader) {
        this(reader, DEFAULT_MAX_READ_SIZE);
    }

    /**
     * Constructor.
     * @param reader a Reader
     * @param maxReadSize maximum to read at once with {@link #readText()}
     *        (expected to be greater than zero).
     */
    public TextReader(Reader reader, int maxReadSize) {
        this(reader, maxReadSize, false);
    }

    /**
     * Constructor.
     * @param reader a Reader
     * @param maxReadSize maximum to read at once with {@link #readText()}
     *        (expected to be greater than zero).
     * @param removeTrailingDelimiter whether to remove the delimiter on
     *        which the text was split (paragraph break, or the white spaces
     *        following a sentence or a word) from the returned text
     */
    public TextReader(Reader reader, int maxReadSize,
            boolean removeTrailingDelimiter) {
        super();
        this.maxReadSize = maxReadSize;
        this.reader = IOUtil.toBufferedReader(reader);
        this.removeTrailingDelimiter = removeTrailingDelimiter;
    }

    /**
     * Reads characters straight from the underlying reader. Text held back
     * by a previous {@link #readText()} call is not consulted by this method.
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        return reader.read(cbuf, off, len);
    }

    /**
     * Reads the next chunk of text, up to the maximum read size specified.
     * It tries as much as possible to break long text into paragraphs,
     * sentences or words, before returning. See class documentation.
     * @return text read, or <code>null</code> when there is nothing left
     *         to read
     * @throws IOException problem reading text.
     */
    public String readText() throws IOException {
        // Only top up the buffer so it never holds more than maxReadSize.
        char[] text = new char[maxReadSize - buffer.length()];
        int num = reader.read(text);
        if (num == -1) {
            // Reader is exhausted: hand back held-back text (if any) rather
            // than silently dropping it.
            return buffer.length() > 0 ? drainBuffer() : null;
        }
        buffer.append(text, 0, num);

        // Return all if we reached the end.
        if (isEndOfStream()) {
            return drainBuffer();
        }

        String chunk = splitAfterLast(PARAGRAPH_PATTERN);
        if (chunk == null) {
            chunk = splitAfterLast(SENTENCE_PATTERN);
        }
        if (chunk == null) {
            chunk = splitAfterLast(WORD_PATTERN);
        }
        if (chunk == null) {
            // No delimiter at all: return everything read so far.
            chunk = drainBuffer();
        }
        return chunk;
    }

    @Override
    public void close() throws IOException {
        reader.close();
    }

    /** Peeks one character ahead to tell whether the reader is exhausted. */
    private boolean isEndOfStream() throws IOException {
        reader.mark(1);
        boolean atEnd = reader.read() == -1;
        reader.reset();
        return atEnd;
    }

    /** Returns the whole buffer content and empties the buffer. */
    private String drainBuffer() {
        String all = buffer.toString();
        buffer.setLength(0);
        return all;
    }

    /**
     * Cuts the buffer after the last delimiter matched by the given pattern
     * and returns the text before the cut, or <code>null</code> when the
     * pattern does not match.
     */
    private String splitAfterLast(Pattern delimiterPattern) {
        Matcher m = delimiterPattern.matcher(buffer);
        if (!m.find()) {
            return null;
        }
        int delimiterStart = m.start(1);
        int delimiterEnd = m.end(1);
        String chunk = buffer.substring(
                0, removeTrailingDelimiter ? delimiterStart : delimiterEnd);
        // Always consume the delimiter itself. Leaving it in the buffer would
        // hand it back as leading text of the next chunk and, when it is the
        // only match in a full buffer, would stop readText() from ever
        // advancing (it would return empty strings forever).
        buffer.delete(0, delimiterEnd);
        return chunk;
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy