All Downloads are FREE. Search and download functionalities are using the official Maven repository.

dev.langchain4j.data.document.splitter.HierarchicalDocumentSplitter Maven / Gradle / Ivy

Go to download

Java implementation of LangChain: Integrate your Java application with countless AI tools and services smoothly

There is a newer version: 0.36.2
Show newest version
package dev.langchain4j.data.document.splitter;

import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentSplitter;
import dev.langchain4j.data.document.Metadata;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.model.Tokenizer;
import lombok.Getter;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;

import static dev.langchain4j.internal.Utils.firstChars;
import static dev.langchain4j.internal.ValidationUtils.*;

/**
 * Base class for hierarchical document splitters.
 *
 * 

Extends {@link DocumentSplitter} and provides machinery for sub-splitting documents * when a single segment is too long. */ public abstract class HierarchicalDocumentSplitter implements DocumentSplitter { @Getter(lazy = true) private final HierarchicalDocumentSplitter overlapSentenceSplitter = new DocumentBySentenceSplitter(1, 0, null, null); private static final String INDEX = "index"; protected final int maxSegmentSize; protected final int maxOverlapSize; protected final Tokenizer tokenizer; protected final DocumentSplitter subSplitter; /** * Creates a new instance of {@link HierarchicalDocumentSplitter}. * @param maxSegmentSizeInChars The maximum size of a segment in characters. * @param maxOverlapSizeInChars The maximum size of the overlap between segments in characters. */ protected HierarchicalDocumentSplitter(int maxSegmentSizeInChars, int maxOverlapSizeInChars) { this(maxSegmentSizeInChars, maxOverlapSizeInChars, null, null); } /** * Creates a new instance of {@link HierarchicalDocumentSplitter}. * @param maxSegmentSizeInChars The maximum size of a segment in characters. * @param maxOverlapSizeInChars The maximum size of the overlap between segments in characters. * @param subSplitter The sub-splitter to use when a single segment is too long. */ protected HierarchicalDocumentSplitter(int maxSegmentSizeInChars, int maxOverlapSizeInChars, HierarchicalDocumentSplitter subSplitter) { this(maxSegmentSizeInChars, maxOverlapSizeInChars, null, subSplitter); } /** * Creates a new instance of {@link HierarchicalDocumentSplitter}. * @param maxSegmentSizeInTokens The maximum size of a segment in tokens. * @param maxOverlapSizeInTokens The maximum size of the overlap between segments in tokens. * @param tokenizer The tokenizer to use to estimate the number of tokens in a text. */ protected HierarchicalDocumentSplitter(int maxSegmentSizeInTokens, int maxOverlapSizeInTokens, Tokenizer tokenizer) { this(maxSegmentSizeInTokens, maxOverlapSizeInTokens, tokenizer, null); } /** * Creates a new instance of {@link HierarchicalDocumentSplitter}. * @param maxSegmentSizeInTokens The maximum size of a segment in tokens. * @param maxOverlapSizeInTokens The maximum size of the overlap between segments in tokens. * @param tokenizer The tokenizer to use to estimate the number of tokens in a text. * @param subSplitter The sub-splitter to use when a single segment is too long. */ protected HierarchicalDocumentSplitter(int maxSegmentSizeInTokens, int maxOverlapSizeInTokens, Tokenizer tokenizer, DocumentSplitter subSplitter) { this.maxSegmentSize = ensureGreaterThanZero(maxSegmentSizeInTokens, "maxSegmentSize"); this.maxOverlapSize = ensureBetween(maxOverlapSizeInTokens, 0, maxSegmentSize, "maxOverlapSize"); this.tokenizer = tokenizer; this.subSplitter = subSplitter == null ? defaultSubSplitter() : subSplitter; } /** * Splits the provided text into parts. * Implementation API. * @param text The text to be split. * @return An array of parts. */ protected abstract String[] split(String text); /** * Delimiter string to use to re-join the parts. * @return The delimiter. */ protected abstract String joinDelimiter(); /** * The default sub-splitter to use when a single segment is too long. * @return The default sub-splitter. */ protected abstract DocumentSplitter defaultSubSplitter(); @Override public List split(Document document) { ensureNotNull(document, "document"); List segments = new ArrayList<>(); SegmentBuilder segmentBuilder = new SegmentBuilder(maxSegmentSize, this::estimateSize, joinDelimiter()); AtomicInteger index = new AtomicInteger(0); String[] parts = split(document.text()); String overlap = null; for (String part : parts) { int partSize = segmentBuilder.sizeOf(part); if (segmentBuilder.hasSpaceFor(partSize)) { // The part fits in the current segment, so we append it. segmentBuilder.append(part); continue; } if (segmentBuilder.isNotEmpty()) { // The part won't fit in the current segment, so we flush the current segment. String segmentText = segmentBuilder.toString(); if (!segmentText.equals(overlap)) { segments.add(createSegment(segmentText, document, index.getAndIncrement())); overlap = overlapFrom(segmentText); segmentBuilder.reset(); segmentBuilder.append(overlap); if (segmentBuilder.hasSpaceFor(partSize)) { // The part fits in the current segment, so we append it. segmentBuilder.append(part); continue; } } } // Enforce that we have a sub-splitter defined. if (subSplitter == null) { throw new RuntimeException(String.format( "The text \"%s...\" (%s %s long) doesn't fit into the maximum segment size (%s %s), " + "and there is no subSplitter defined to split it further.", firstChars(part, 30), estimateSize(part), tokenizer == null ? "characters" : "tokens", maxSegmentSize, tokenizer == null ? "characters" : "tokens" )); } // Delegate the splitting of the part to the sub-splitter. segmentBuilder.append(part); for (TextSegment segment : subSplitter.split(Document.from(segmentBuilder.toString()))) { segments.add(createSegment(segment.text(), document, index.getAndIncrement())); } TextSegment lastSegment = segments.get(segments.size() - 1); overlap = overlapFrom(lastSegment.text()); segmentBuilder.reset(); segmentBuilder.append(overlap); } if (segmentBuilder.isNotEmpty() && !segmentBuilder.toString().equals(overlap)) { segments.add(createSegment(segmentBuilder.toString(), document, index.getAndIncrement())); } return segments; } /** * Returns the overlap region at the end of the provided segment text. * @param segmentText The segment text. * @return The overlap region, or an empty string if there is no overlap. */ String overlapFrom(String segmentText) { if (maxOverlapSize == 0) { return ""; } // always split by sentence, as it is the smallest meaningful unit of text List sentences = Arrays.asList(getOverlapSentenceSplitter().split(segmentText)); Collections.reverse(sentences); SegmentBuilder overlapBuilder = new SegmentBuilder(maxOverlapSize, this::estimateSize, joinDelimiter()); for (String sentence : sentences) { if (overlapBuilder.hasSpaceFor(sentence)) { overlapBuilder.prepend(sentence); } else { break; } } return overlapBuilder.toString(); } /** * Estimates the size in the provided text. * *

If a {@link Tokenizer} is provided, the number of tokens is estimated. * Otherwise, the number of characters is estimated. * * @param text The text. * @return The estimated number of tokens. */ int estimateSize(String text) { if (tokenizer != null) { return tokenizer.estimateTokenCountInText(text); } else { return text.length(); } } /** * Creates a new {@link TextSegment} from the provided text and document. * *

The segment inherits all metadata from the document. The segment also includes * an "index" metadata key representing the segment position within the document. * * @param text The text of the segment. * @param document The document to which the segment belongs. * @param index The index of the segment within the document. */ static TextSegment createSegment(String text, Document document, int index) { Metadata metadata = document.metadata().copy().put(INDEX, String.valueOf(index)); return TextSegment.from(text, metadata); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy