net.sf.saxon.str.ZenoString Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of Saxon-HE Show documentation

The XSLT and XQuery Processor

There is a newer version: 12.5

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2018-2022 Saxonica Limited
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

package net.sf.saxon.str;

import net.sf.saxon.expr.sort.EmptyIntIterator;
import net.sf.saxon.transpile.CSharpReplaceBody;
import net.sf.saxon.z.IntIterator;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.function.IntPredicate;

/**
 * A ZenoString is an implementation of UnicodeString that comprises a list
 * of segments representing substrings of the total string. By convention the
 * segments are not themselves ZenoStrings, so the structure is a shallow tree.
 * An index holds pointers to the segments and their offsets within the string
 * as a whole; this is used to locate the codepoint at any particular location
 * in the string.
 *
 * The segments will always be non-empty. An empty string contains no segments.
 *
 * The key to the performance of the data structure (and its name) is
 * the algorithm for consolidating segments when strings are concatenated,
 * so as to keep the number of segments increasing logarithmically with
 * the string size, with short segments at the extremities to allow efficient
 * further concatenation at the ends.
 *
 * For further details see the paper by Michael Kay at Balisage 2021.
 */

public class ZenoString extends UnicodeString {

    // The segments making up the string, in order. Not final: concatenation builds a new
    // ZenoString and then assigns a copied list to this field.
    private List<UnicodeString> segments = new ArrayList<>();
    // Parallel to segments: entry i is the position within the whole string of the first
    // code point of segment i (the first entry is therefore 0).
    private List<Long> offsets = new ArrayList<>();

    /**
     * Private constructor creating an empty ZenoString: the segment list and the
     * offset list both start out empty. Used by the static factory method and by
     * internal code that populates the new instance afterwards.
     */

    private ZenoString() {}

    /**
     * Private constructor creating a ZenoString that consists of exactly one segment,
     * which starts at offset zero.
     *
     * @param content the string that becomes the sole segment
     */

    private ZenoString(UnicodeString content) {
        // The only segment necessarily begins at position 0 of the overall string
        offsets.add(0L);
        segments.add(content);
    }

    /**
     * A shared ZenoString instance that contains no segments, representing the empty string.
     */

    public static final ZenoString EMPTY = new ZenoString();

    /**
     * Wrap a supplied UnicodeString as a ZenoString.
     *
     * <p>A value that is already a ZenoString is returned unchanged. An empty value
     * produces a new ZenoString with no segments; any other value becomes the single
     * segment of a new ZenoString.</p>
     *
     * @param content the supplied UnicodeString
     * @return the resulting ZenoString
     */

    public static ZenoString of(UnicodeString content) {
        if (content instanceof ZenoString) {
            return (ZenoString) content;
        }
        return content.isEmpty() ? new ZenoString() : new ZenoString(content);
    }

    /**
     * Locate the segment that holds the character at a given position in the string.
     *
     * @param offset the position of the character within the whole string. This must be
     *               greater than or equal to zero, and less than the length of the string.
     * @return the index (into the segment list) of the segment containing that character
     * @throws IndexOutOfBoundsException if the string is empty or the supplied offset
     *                                   is out of range.
     */

    private int segmentForOffset(long offset) {
        if (segments.isEmpty()) {
            throw new IndexOutOfBoundsException("ZenoString is empty");
        }
        final int lastIndex = offsets.size() - 1;
        final int found = binarySearch(offset, 0, lastIndex);
        if (found >= 0) {
            return found;
        }
        // A negative result from the search means no segment covers the offset
        throw new IndexOutOfBoundsException("Index " + offset + " out of range 0-" + (length()-1));
    }

    /**
     * Binary search over the offsets list for the segment covering a given position.
     *
     * @param offset the position sought within the whole string
     * @param start  the lowest segment index to consider (inclusive)
     * @param end    the highest segment index to consider (inclusive)
     * @return the index of the segment whose range includes the offset, or -1 if the
     *         one remaining candidate does not cover it
     */
    private int binarySearch(long offset, int start, int end) {
        int lo = start;
        int hi = end;
        while (lo != hi) {
            // Round the midpoint upwards so that it is always greater than lo;
            // this guarantees the range shrinks on both branches.
            int mid = lo + (hi - lo + 1) / 2;
            if (offsets.get(mid) > offset) {
                // Segment mid starts after the target, so the answer lies to its left
                hi = mid - 1;
            } else {
                lo = mid;
            }
        }
        // One candidate left: accept it only if the offset falls within its extent
        long segmentStart = offsets.get(lo);
        long segmentEnd = segmentStart + segments.get(lo).length();
        return (segmentStart <= offset && offset < segmentEnd) ? lo : -1;
    }


    /**
     * Get an iterator over the code points present in the string.
     * <p>The iterator walks the segments in order and delegates to each segment's own
     * code point iterator in turn. An empty string yields the shared empty iterator.</p>
     * @return an iterator that delivers the individual code points
     */
    @Override
    @CSharpReplaceBody(code="return new Saxon.Impl.Overrides.ZenoStringCodepoints(segments);")
    public IntIterator codePoints() {
        if (isEmpty()) {
            return EmptyIntIterator.getInstance();
        }
        return new IntIterator() {
            final Iterator<UnicodeString> outerIterator = segments.iterator();
            // Iterator over the segment currently being read; null when none is open
            IntIterator innerIterator;

            @Override
            public boolean hasNext() {
                if (innerIterator == null) {
                    return outerIterator.hasNext();
                } else if (innerIterator.hasNext()) {
                    return true;
                } else {
                    // Current segment is finished; more code points exist only if
                    // another segment follows (segments are never empty)
                    innerIterator = null;
                    return outerIterator.hasNext();
                }
            }

            @Override
            public int next() {
                // Move on whenever there is no open segment OR the open one is exhausted,
                // so that next() is correct even if the caller did not call hasNext()
                // between the last code point of one segment and the first of the next.
                while (innerIterator == null || !innerIterator.hasNext()) {
                    if (!outerIterator.hasNext()) {
                        throw new NoSuchElementException();
                    }
                    innerIterator = outerIterator.next().codePoints();
                }
                return innerIterator.next();
            }
        };
    }

    /**
     * Get the length of the string, computed as the start offset of the final
     * segment plus that segment's own length.
     *
     * @return the number of code points in the string (zero if there are no segments)
     */
    @Override
    public long length() {
        if (segments.isEmpty()) {
            return 0L;
        }
        final int last = segments.size() - 1;
        return offsets.get(last) + segments.get(last).length();
    }

    /**
     * Ask whether the string is empty. Because segments are always non-empty,
     * the string is empty exactly when it has no segments.
     *
     * @return true if the length of the string is zero
     */
    @Override
    public boolean isEmpty() {
        return segments.isEmpty();
    }

    /**
     * Get the number of bits needed to hold all the characters in this string.
     * This is the largest width reported by any segment, with a floor of 7.
     *
     * @return 7 for ascii characters, 8 for latin-1, 16 for BMP, 24 for general Unicode.
     */
    @Override
    public int getWidth() {
        int widest = 7;
        for (UnicodeString segment : segments) {
            final int segmentWidth = segment.getWidth();
            if (segmentWidth == 24) {
                // 24 is the maximum, so there is no need to look at further segments
                return 24;
            }
            if (segmentWidth > widest) {
                widest = segmentWidth;
            }
        }
        return widest;
    }

    /**
     * Get the position of the first occurrence of the specified codepoint,
     * starting the search at a given position in the string
     *
     * @param codePoint the sought codePoint
     * @param from      the position from which the search should start (0-based), in the
     *                  range 0 to length()-1
     * @return the position (0-based) of the first occurrence found, or -1 if not found
     * @throws IndexOutOfBoundsException if the from value is out of range
     */
    @Override
    public long indexOf(int codePoint, long from) {
        from = Math.max(from, 0);
        if (from >= length()) {
            return -1L;
        }
        int first = segmentForOffset(from);
        for (int i=first; i= 0) {
                return pos + offset;
            }
        }
        return -1;
    }

    /**
     * Get the position of the first code point, at or after a given position, for which
     * the supplied predicate holds.
     *
     * @param predicate the condition applied to code points
     * @param from      the position (0-based) at which the search starts
     * @return the position (0-based) of the first matching code point, or -1 if none matches
     * @throws IndexOutOfBoundsException if no segment contains the {@code from} position
     */
    @Override
    public long indexWhere(IntPredicate predicate, long from) {
        final int startSegment = segmentForOffset(from);
        for (int seg = startSegment; seg < segments.size(); seg++) {
            final long base = offsets.get(seg);
            final UnicodeString current = segments.get(seg);
            // Only the starting segment is searched from part-way through
            final long startWithin = (seg == startSegment) ? from - base : 0;
            final long found = current.indexWhere(predicate, startWithin);
            if (found >= 0) {
                return base + found;
            }
        }
        return -1;
    }

    /**
     * Get the code point at a given position in the string
     * @param index the given position (0-based)
     * @return the code point found at that position
     * @throws IndexOutOfBoundsException if the index is out of range
     */
    @Override
    public int codePointAt(long index) {
        final int segmentIndex = segmentForOffset(index);
        // Translate the position in the whole string to a position within the segment
        final long positionInSegment = index - offsets.get(segmentIndex);
        return segments.get(segmentIndex).codePointAt(positionInSegment);
    }

    /**
     * Get a substring of this codepoint sequence, with a given start and end position
     *
     * @param start the start position (0-based): that is, the position of the first
     *              code point to be included
     * @param end   the end position (0-based): specifically, the position of the first
     *              code point not to be included
     */
    @Override
    public UnicodeString substring(long start, long end) {
        checkSubstringBounds(start, end);
        if (start == end) {
            return EmptyUnicodeString.getInstance();
        } else if (start + 1 == end) {
            return new UnicodeChar(codePointAt(start));
        }
        int first = segmentForOffset(start);
        int last = segmentForOffset(end-1);
        if (first == last) {
            UnicodeString segment = segments.get(first);
            long offset = offsets.get(first);
            return segment.substring(start - offset, end - offset);
        } else {
            ZenoString z = ZenoString.of(segments.get(first).substring(start - offsets.get(first)));
            for (int i=first+1; i(segments);
            z.segments.addAll(((ZenoString)other).segments);
            z.offsets = new ArrayList<>(offsets);
            long len = length();
            for (long offset : ((ZenoString) other).offsets) {
                z.offsets.add(offset + len);
            }
            return (len < 32 || other.length() < 32 ? z.consolidate0() : z);
        } else {
            ZenoString z = new ZenoString();
            z.segments = new ArrayList<>(segments);
            z.offsets = new ArrayList<>(offsets);
            z.segments.add(other);
            z.offsets.add(length());
            return z.consolidate0();
        }
    }

    /**
     * Copy the content of every segment, in order, into a byte array by delegating to
     * each segment's own copy8bit method. The write position advances by the segment's
     * length32() after each segment.
     *
     * @param target the array to be written to
     * @param offset the position in the array at which the first segment is written
     */
    @Override
    void copy8bit(byte[] target, int offset) {
        int writePosition = offset;
        for (UnicodeString segment : segments) {
            segment.copy8bit(target, writePosition);
            writePosition += segment.length32();
        }
    }

    /**
     * Copy the content of every segment, in order, into a char array by delegating to
     * each segment's own copy16bit method. The write position advances by the segment's
     * length32() after each segment.
     *
     * @param target the array to be written to
     * @param offset the position in the array at which the first segment is written
     */
    @Override
    void copy16bit(char[] target, int offset) {
        int writePosition = offset;
        for (UnicodeString segment : segments) {
            segment.copy16bit(target, writePosition);
            writePosition += segment.length32();
        }
    }

    /**
     * Copy the content of every segment, in order, into a byte array by delegating to
     * each segment's own copy24bit method. The write position advances by three times
     * the segment's length32() after each segment.
     *
     * @param target the array to be written to
     * @param offset the position in the array at which the first segment is written
     */
    @Override
    void copy24bit(byte[] target, int offset) {
        int writePosition = offset;
        for (UnicodeString segment : segments) {
            segment.copy24bit(target, writePosition);
            writePosition += 3 * segment.length32();
        }
    }

    /**
     * Merge adjacent segments in place, scanning from the right-hand end of the
     * segment list towards the left. A segment is merged with its right-hand
     * neighbour when it is no longer than either of its neighbours, or when the
     * combined length of the pair is at most 32.
     *
     * @return this ZenoString, after modification
     */
    private ZenoString consolidate() {
        // internal, so works in-situ

        final int lastIndex = segments.size() - 1;
        // Length of the segment immediately to the right of the one being examined
        long rightLength = segments.get(lastIndex).length();
        for (int pos = lastIndex - 1; pos >= 0; pos--) {
            final UnicodeString current = segments.get(pos);
            final long currentLength = current.length();
            // The leftmost segment has no left-hand neighbour; treat that as length 0
            final long leftLength = (pos > 0) ? segments.get(pos - 1).length() : 0;
            final boolean localMinimum = currentLength <= rightLength && currentLength <= leftLength;
            final boolean smallPair = currentLength + rightLength <= 32;
            if (localMinimum || smallPair) {
                final UnicodeString merged = concatSegments(current, segments.get(pos + 1));
                segments.set(pos, merged);
                // The merged segment keeps the offset of its left-hand part, so only the
                // entries for the absorbed right-hand segment are dropped
                segments.remove(pos + 1);
                offsets.remove(pos + 1);
                rightLength = merged.length();
            } else {
                rightLength = currentLength;
            }
        }
        return this;
    }

    /**
     * Write the segments, one at a time and in order, to a UnicodeWriter
     * @param writer the writer to which the string is to be written
     * @throws IOException if the writer reports a failure
     */

    public void writeSegments(UnicodeWriter writer) throws IOException {
        for (UnicodeString segment : segments) {
            writer.write(segment);
        }
    }

    /**
     * Concatenate two strings into a single newly allocated string, choosing the
     * narrowest of the three array-backed representations that can hold both operands.
     *
     * @param left  the string supplying the leading characters
     * @param right the string supplying the trailing characters
     * @return a Twine8 if both operands have width 8 or less; otherwise a Twine16 if both
     *         have width 16 or less; otherwise a Twine24
     */
    public static UnicodeString concatSegments(UnicodeString left, UnicodeString right) {
        // The representation is dictated by the wider of the two operands
        final int width = Math.max(left.getWidth(), right.getWidth());
        final int leftLength = left.length32();
        final int rightLength = right.length32();
        if (width <= 8) {
            byte[] bytes = new byte[leftLength + rightLength];
            left.copy8bit(bytes, 0);
            right.copy8bit(bytes, leftLength);
            return new Twine8(bytes);
        }
        if (width <= 16) {
            char[] chars = new char[leftLength + rightLength];
            left.copy16bit(chars, 0);
            right.copy16bit(chars, leftLength);
            return new Twine16(chars);
        }
        // Three bytes are used per code point in the widest representation
        byte[] bytes = new byte[(leftLength + rightLength) * 3];
        left.copy24bit(bytes, 0);
        right.copy24bit(bytes, leftLength * 3);
        return new Twine24(bytes);
    }



    /**
     * Merge adjacent segments in place, scanning from the right-hand end of the
     * segment list towards the left. A segment is merged with its right-hand neighbour
     * whenever its length is less than 1.1 times the length of that neighbour.
     *
     * @return this ZenoString, after modification
     */
    private ZenoString consolidate0() {
        // internal, so works in-situ

        for (int pos = segments.size() - 2; pos >= 0; pos--) {
            final UnicodeString left = segments.get(pos);
            final UnicodeString right = segments.get(pos + 1);
            if (left.length() < right.length() * 1.1) {
                segments.set(pos, concatSegments(left, right));
                // The merged segment keeps the offset of its left-hand part
                segments.remove(pos + 1);
                offsets.remove(pos + 1);
            }
        }

        return this;
    }

    /**
     * Alternative consolidation strategy that merges adjacent segments in place using
     * thresholds that double with each step: one pass works forwards from the start of
     * the segment list, a second pass works backwards from the end.
     *
     * NOTE(review): no caller of this method is visible in this part of the file.
     *
     * @return this ZenoString, after modification
     */
    private ZenoString consolidate1() {
        // internal, so works in-situ

        // The midpoint is computed once, from the segment count before any merging
        int halfway = segments.size()/2;
        // Forward pass: the merge threshold is 32 at index 0 and doubles for each later index.
        // The index still advances after a merge.
        for (int i=0; i<(halfway-1); i++) {
            if (segments.get(i).length() +segments.get(i+1).length() < (32L << i)) {
                // NOTE(review): this uses UnicodeString.concat rather than concatSegments;
                // the type it returns is not visible here - confirm it cannot be a ZenoString,
                // since segments are by convention not themselves ZenoStrings.
                UnicodeString merged = segments.get(i).concat(segments.get(i+1));
                segments.remove(i+1);
                offsets.remove(i+1);
                segments.set(i, merged);
            }
        }

        // Backward pass: starts from the current last segment and stops above the midpoint
        // computed earlier. The threshold starts at 32 and doubles on every iteration,
        // whether or not a merge takes place (distance is incremented inside the test).
        int distance = 0;
        for (int i = segments.size() - 1; i > halfway; i--) {
            if (segments.get(i).length() + segments.get(i-1).length() <  (32L << (distance++))) {
                UnicodeString merged = segments.get(i-1).concat(segments.get(i));
                segments.remove(i);
                offsets.remove(i);
                segments.set(i-1, merged);
            }
        }

        //showSegmentLengths();
        return this;
    }

    /**
     * Get an equivalent UnicodeString that uses the most economical representation available.
     * <p>With no segments the result is the empty string; with one segment, that segment
     * itself. A string with fewer than 32 segments, fewer than 256 code points and a width
     * of at most 16 is flattened into a BMPString. Otherwise this object is returned.</p>
     *
     * @return an equivalent UnicodeString
     */
    @Override
    public UnicodeString economize() {
        final int segmentCount = segments.size();
        switch (segmentCount) {
            case 0:
                return EmptyUnicodeString.getInstance();
            case 1:
                return segments.get(0);
            default:
                break;
        }
        if (segmentCount < 32 && length() < 256 && getWidth() <= 16) {
            // Return a single wrapped Java String, for economy of any subsequent toString() operations.
            return new BMPString(toString());
        }
        return this;
    }

    /**
     * Get the content of the string as a Java String, formed by appending the
     * string value of each segment in order.
     *
     * @return the string value
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        for (UnicodeString str : segments) {
            sb.append(str.toString());
        }
        return sb.toString();
    }

    /**
     * This method is for diagnostics and unit testing only: it exposes
     * the lengths of the internal segments. This is an implementation detail
     * that is subject to change and does not affect the exposed functionality.
     * @return the lengths of the segments, in segment order, as a newly created list
     */

    public List<Long> debugSegmentLengths() {
        List<Long> result = new ArrayList<>(segments.size());
        for (UnicodeString str : segments) {
            result.add(str.length());
        }
        return result;
    }

    // Diagnostic method: prints the length of every segment, each followed by ", ",
    // as a single line on System.err
    private void showSegmentLengths() {
        StringBuilder report = new StringBuilder();
        for (UnicodeString segment : segments) {
            report.append(segment.length()).append(", ");
        }
        System.err.println(report);
    }

    private void verifySegmentLengths() {
        long total = 0;
        for (int i = 0; i= 0 && start < z.length()) {
//            long next = start + 1000;
//            if (next > z.length()) {
//                break;
//            }
//            result = result.concat(z.substring(start, next)).concat(StringConstants.ASTERISK);
//            start = next+1;
//            occurrences++;
//        }
//        System.err.println("Replacements: " + occurrences);
//        result.showSegmentLengths();
//    }
//
//    private static void alphabet() {
//        String alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
//        alphabet = alphabet + alphabet + alphabet + alphabet;
//        ZenoString z = new ZenoString();
//        for (int i=0; i

    

    

    
            
    
            

    
        
            
                Related Artifacts
                
                     mysql-connector-java mysql
 facebook-messenger com.github.codedrinker
 selenium-java org.seleniumhq.selenium
 instagram-java com.github.sola92
 gson com.google.code.gson
 poi org.apache.poi
 httpclient org.apache.httpcomponents
 json org.json
 facebook-java-api com.google.code.facebook-java-api
 poi-ooxml org.apache.poi
 jackson-databind com.fasterxml.jackson.core
 junit junit
 primefaces org.primefaces
 ojdbc7 com.github.noraui
 jfoenix com.jfoenix
 testng org.testng
 json-simple com.googlecode.json-simple
 selenium-server org.seleniumhq.selenium
 itextpdf com.itextpdf
 spring-core org.springframework
                
            
        
        
            
                Related Groups
                
                     org.springframework
 org.apache.poi
 org.hibernate
 org.springframework.boot
 com.fasterxml.jackson.core
 com.itextpdf
 org.seleniumhq.selenium
 mysql
 org.finos.legend.engine
 org.apache.httpcomponents
 org.apache.logging.log4j
 org.openjfx
 org.apache.commons
 org.json
 com.google.guava
 com.google.zxing
 net.sf.jasperreports
 javax.xml.bind
 ojdbc
 com.google.code.facebook-java-api