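// Source listing: dev.langchain4j.data.document.splitter.HierarchicalDocumentSplitter
// (langchain4j artifact; build-tool snippets for Maven / Gradle / Ivy, the list of all
// versions, and the documentation are available on the original artifact page.)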
package dev.langchain4j.data.document.splitter;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentSplitter;
import dev.langchain4j.data.document.Metadata;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.model.Tokenizer;
import lombok.Getter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import static dev.langchain4j.internal.Utils.firstChars;
import static dev.langchain4j.internal.ValidationUtils.*;
/**
 * Base class for hierarchical document splitters.
 * <p>
 * Implements {@link DocumentSplitter} and provides the machinery for sub-splitting documents
 * when a single part is too long to fit into one segment: the oversized text is handed to a
 * sub-splitter (either supplied by the caller or obtained from {@link #defaultSubSplitter()}).
 * <p>
 * Sizes are measured in tokens when a {@link Tokenizer} is configured and in characters otherwise
 * (see {@link #estimateSize(String)}).
 */
public abstract class HierarchicalDocumentSplitter implements DocumentSplitter {

    /** Metadata key under which the position of a segment within its document is stored. */
    private static final String INDEX = "index";

    /**
     * Sentence splitter used by {@link #overlapFrom(String)} to cut a segment into sentences.
     * Lombok generates a lazily-initialising {@code getOverlapSentenceSplitter()} accessor for it.
     */
    @Getter(lazy = true)
    private final HierarchicalDocumentSplitter overlapSentenceSplitter =
            new DocumentBySentenceSplitter(1, 0, null, null);

    /** Maximum size of a segment, in tokens if {@link #tokenizer} is set, otherwise in characters. */
    protected final int maxSegmentSize;

    /** Maximum size of the overlap between consecutive segments, in the same unit as {@link #maxSegmentSize}. */
    protected final int maxOverlapSize;

    /** Tokenizer used to estimate sizes; {@code null} means sizes are measured in characters. */
    protected final Tokenizer tokenizer;

    /** Splitter used for text that does not fit into a single segment; may be {@code null}. */
    protected final DocumentSplitter subSplitter;

    /**
     * Creates a new instance of {@link HierarchicalDocumentSplitter}.
     *
     * @param maxSegmentSizeInChars The maximum size of a segment in characters.
     * @param maxOverlapSizeInChars The maximum size of the overlap between segments in characters.
     */
    protected HierarchicalDocumentSplitter(int maxSegmentSizeInChars, int maxOverlapSizeInChars) {
        this(maxSegmentSizeInChars, maxOverlapSizeInChars, null, null);
    }

    /**
     * Creates a new instance of {@link HierarchicalDocumentSplitter}.
     *
     * @param maxSegmentSizeInChars The maximum size of a segment in characters.
     * @param maxOverlapSizeInChars The maximum size of the overlap between segments in characters.
     * @param subSplitter           The sub-splitter to use when a single segment is too long.
     */
    protected HierarchicalDocumentSplitter(int maxSegmentSizeInChars,
                                           int maxOverlapSizeInChars,
                                           HierarchicalDocumentSplitter subSplitter) {
        this(maxSegmentSizeInChars, maxOverlapSizeInChars, null, subSplitter);
    }

    /**
     * Creates a new instance of {@link HierarchicalDocumentSplitter}.
     *
     * @param maxSegmentSizeInTokens The maximum size of a segment in tokens.
     * @param maxOverlapSizeInTokens The maximum size of the overlap between segments in tokens.
     * @param tokenizer              The tokenizer to use to estimate the number of tokens in a text.
     */
    protected HierarchicalDocumentSplitter(int maxSegmentSizeInTokens,
                                           int maxOverlapSizeInTokens,
                                           Tokenizer tokenizer) {
        this(maxSegmentSizeInTokens, maxOverlapSizeInTokens, tokenizer, null);
    }

    /**
     * Creates a new instance of {@link HierarchicalDocumentSplitter}.
     * <p>
     * Note for subclasses: when {@code subSplitter} is {@code null}, this constructor calls
     * {@link #defaultSubSplitter()} before the subclass' own fields have been initialised.
     *
     * @param maxSegmentSizeInTokens The maximum size of a segment in tokens.
     * @param maxOverlapSizeInTokens The maximum size of the overlap between segments in tokens.
     * @param tokenizer              The tokenizer to use to estimate the number of tokens in a text.
     * @param subSplitter            The sub-splitter to use when a single segment is too long.
     */
    protected HierarchicalDocumentSplitter(int maxSegmentSizeInTokens,
                                           int maxOverlapSizeInTokens,
                                           Tokenizer tokenizer,
                                           DocumentSplitter subSplitter) {
        this.maxSegmentSize = ensureGreaterThanZero(maxSegmentSizeInTokens, "maxSegmentSize");
        // The overlap is validated against the already-validated segment size.
        this.maxOverlapSize = ensureBetween(maxOverlapSizeInTokens, 0, maxSegmentSize, "maxOverlapSize");
        this.tokenizer = tokenizer;
        this.subSplitter = subSplitter == null ? defaultSubSplitter() : subSplitter;
    }

    /**
     * Splits the provided text into parts.
     * Implementation API.
     *
     * @param text The text to be split.
     * @return An array of parts.
     */
    protected abstract String[] split(String text);

    /**
     * Delimiter string to use to re-join the parts.
     *
     * @return The delimiter.
     */
    protected abstract String joinDelimiter();

    /**
     * The default sub-splitter to use when a single segment is too long.
     *
     * @return The default sub-splitter.
     */
    protected abstract DocumentSplitter defaultSubSplitter();

    /**
     * Splits the document into segments no larger than {@link #maxSegmentSize}.
     * <p>
     * The document text is cut into parts via {@link #split(String)}; parts are accumulated into
     * the current segment while they fit. When a part does not fit, the current segment is emitted
     * and the next one is seeded with the overlap taken from it. A part that still does not fit is
     * delegated to the sub-splitter.
     *
     * @param document The document to split; must not be {@code null}.
     * @return The segments, each carrying the document metadata plus its index.
     * @throws RuntimeException if a part does not fit and no sub-splitter is available.
     */
    @Override
    public List<TextSegment> split(Document document) {
        ensureNotNull(document, "document");

        List<TextSegment> segments = new ArrayList<>();
        SegmentBuilder segmentBuilder = new SegmentBuilder(maxSegmentSize, this::estimateSize, joinDelimiter());
        AtomicInteger index = new AtomicInteger(0);

        String[] parts = split(document.text());
        // Overlap text currently seeded into the builder; used to avoid emitting a segment
        // that consists of nothing but the overlap.
        String overlap = null;

        for (String part : parts) {
            int partSize = segmentBuilder.sizeOf(part);
            if (segmentBuilder.hasSpaceFor(partSize)) {
                // The part fits in the current segment, so we append it.
                segmentBuilder.append(part);
                continue;
            }

            if (segmentBuilder.isNotEmpty()) {
                // The part won't fit in the current segment, so we flush the current segment.
                String segmentText = segmentBuilder.toString();
                if (!segmentText.equals(overlap)) {
                    segments.add(createSegment(segmentText, document, index.getAndIncrement()));
                    overlap = overlapFrom(segmentText);
                    segmentBuilder.reset();
                    segmentBuilder.append(overlap);
                    if (segmentBuilder.hasSpaceFor(partSize)) {
                        // The part fits in the fresh segment (after the overlap), so we append it.
                        segmentBuilder.append(part);
                        continue;
                    }
                }
            }

            // Enforce that we have a sub-splitter defined.
            if (subSplitter == null) {
                throw new RuntimeException(String.format(
                        "The text \"%s...\" (%s %s long) doesn't fit into the maximum segment size (%s %s), " +
                                "and there is no subSplitter defined to split it further.",
                        firstChars(part, 30),
                        estimateSize(part), tokenizer == null ? "characters" : "tokens",
                        maxSegmentSize, tokenizer == null ? "characters" : "tokens"
                ));
            }

            // Delegate the splitting of the part (prefixed by whatever the builder holds) to the sub-splitter.
            segmentBuilder.append(part);
            for (TextSegment segment : subSplitter.split(Document.from(segmentBuilder.toString()))) {
                segments.add(createSegment(segment.text(), document, index.getAndIncrement()));
            }

            // The sub-splitter may have produced nothing; with no segment emitted so far there is
            // nothing to take an overlap from, so fall back to an empty overlap instead of failing.
            overlap = segments.isEmpty()
                    ? ""
                    : overlapFrom(segments.get(segments.size() - 1).text());
            segmentBuilder.reset();
            segmentBuilder.append(overlap);
        }

        // Emit whatever is left, unless it is only the overlap of the previous segment.
        if (segmentBuilder.isNotEmpty() && !segmentBuilder.toString().equals(overlap)) {
            segments.add(createSegment(segmentBuilder.toString(), document, index.getAndIncrement()));
        }

        return segments;
    }

    /**
     * Returns the overlap region at the end of the provided segment text.
     * <p>
     * The text is split into sentences, which are then collected from the last one backwards
     * for as long as they fit into {@link #maxOverlapSize}.
     *
     * @param segmentText The segment text.
     * @return The overlap region, or an empty string if there is no overlap.
     */
    String overlapFrom(String segmentText) {
        if (maxOverlapSize == 0) {
            return "";
        }

        // always split by sentence, as it is the smallest meaningful unit of text
        List<String> sentences = Arrays.asList(getOverlapSentenceSplitter().split(segmentText));
        // Walk the sentences from the end of the segment towards its start.
        Collections.reverse(sentences);

        SegmentBuilder overlapBuilder = new SegmentBuilder(maxOverlapSize, this::estimateSize, joinDelimiter());
        for (String sentence : sentences) {
            if (overlapBuilder.hasSpaceFor(sentence)) {
                // Prepending keeps the sentences in their original reading order.
                overlapBuilder.prepend(sentence);
            } else {
                break;
            }
        }
        return overlapBuilder.toString();
    }

    /**
     * Estimates the size of the provided text.
     * <p>
     * If a {@link Tokenizer} is provided, the number of tokens is estimated.
     * Otherwise, the number of characters is returned.
     *
     * @param text The text.
     * @return The estimated number of tokens, or the number of characters if no tokenizer is set.
     */
    int estimateSize(String text) {
        if (tokenizer != null) {
            return tokenizer.estimateTokenCountInText(text);
        } else {
            return text.length();
        }
    }

    /**
     * Creates a new {@link TextSegment} from the provided text and document.
     * <p>
     * The segment inherits all metadata from the document. The segment also includes
     * an "index" metadata key representing the segment position within the document.
     *
     * @param text     The text of the segment.
     * @param document The document to which the segment belongs.
     * @param index    The index of the segment within the document.
     * @return The new segment.
     */
    static TextSegment createSegment(String text, Document document, int index) {
        // Work on a copy so that the document's own metadata is not modified.
        Metadata metadata = document.metadata().copy().put(INDEX, String.valueOf(index));
        return TextSegment.from(text, metadata);
    }
}