All downloads are free. Search and download functionality uses the official Maven repository.

org.ttzero.excel.reader.SharedStrings Maven / Gradle / Ivy

/*
 * Copyright (c) 2017-2018, [email protected] All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.ttzero.excel.reader;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import static java.lang.Character.highSurrogate;
import static java.lang.Character.isBmpCodePoint;
import static java.lang.Character.isValidCodePoint;
import static java.lang.Character.lowSurrogate;
import static java.lang.Integer.numberOfTrailingZeros;
import static org.ttzero.excel.manager.Const.Limit.MAX_CHARACTERS_PER_CELL;
import static org.ttzero.excel.util.ExtBufferedWriter.MALFORMED_CHAR;
import static org.ttzero.excel.util.StringUtil.EMPTY;

/**
 * Read sharedString data
 * 

* This record contains a list of all strings used anywhere in the workbook. * Each string occurs only once. The workbook uses indexes into the list to reference the strings * * @author guanquan.wang at 2018-09-27 14:28 */ public class SharedStrings implements Closeable { private final Logger LOGGER = LoggerFactory.getLogger(getClass()); /** * The maximum capacity, used if a higher value is implicitly specified * by either of the constructors with arguments. * MUST be a power of two <= 1<<36. */ static final int MAXIMUM_CAPACITY = 1 << 20; /** * Constructs a SharedStrings containing the elements of the * specified data array * * @param data the shared strings */ public SharedStrings(String[] data) { max = data.length; offset_forward = 0; status = 1; if (max <= 512) { forward = new String[max]; System.arraycopy(data, offset_forward, forward, 0, max); limit_forward = max; } else { page = (max + 1) >> 1; status <<= 1; forward = new String[page]; limit_forward = page; System.arraycopy(data, offset_forward, forward, 0, limit_forward); offset_backward = page; limit_backward = max - page; backward = new String[limit_backward]; System.arraycopy(data, offset_backward, backward, 0, limit_backward); } } /** * Constructs a SharedString with the xml path, please call * {@link SharedStrings#load()} after instance * * @param is the xml file path * @param cacheSize the number of word per load * @param hotSize the number of high frequency word */ public SharedStrings(InputStream is, int cacheSize, int hotSize) { this.reader = new InputStreamReader(is, StandardCharsets.UTF_8); if (cacheSize > 0) { this.page = tableSizeFor(cacheSize); } this.hotSize = hotSize; } /** * Constructs a SharedStrings with a {@link IndexSharedStringTable} * * @param sst {@link IndexSharedStringTable} * @param cacheSize the number of word per load * @param hotSize the number of high frequency word * @throws IOException if I/O error occur. 
*/ public SharedStrings(IndexSharedStringTable sst, int cacheSize, int hotSize) throws IOException { this.sst = sst; max = sst.size(); if (cacheSize > 0) { this.page = tableSizeFor(cacheSize); } this.hotSize = hotSize; init(); // Load forward limit_forward = sst.get(offset_forward = 0, forward); } /** * Storage the new load data */ private String[] forward; /** * Copy data to this area when the forward area is missing */ private String[] backward; /** * Number of word per load */ private int page; /** * The word total */ private int max = -1, offsetM = 0; /** * The forward offset */ private int offset_forward = -1; /** * The backward offset */ private int offset_backward = -1; /** * The forward limit */ private int limit_forward; /** * The backward limit */ private int limit_backward; /** * A tester of SharedString's cache */ private Tester tester = null; /** * High frequency word */ private Cache hot; /** * Size of hot */ private int hotSize; /** * Main reader */ private Reader reader; /** * Buffered */ private char[] cb; /** * length of cb[] */ private int nChar, length; /** * Shared string table */ private IndexSharedStringTable sst; /** * 0: empty * 1: forward only * 2: forward + backward * 4: large model/unknown size */ private int status; /** * Buffer */ StringBuilder buf = null; // For debug private int total, total_forward, total_backward, total_hot, total_sst; /** * @return the shared string unique count * -1 if unknown size */ public int size() { return max; } /** * Returns a power of two size for the given target capacity. * * @param cap the custom buffer size * @return Returns a power of two size */ public static int tableSizeFor(int cap) { int n = cap - 1; n |= n >>> 1; n |= n >>> 2; n |= n >>> 4; n |= n >>> 8; n |= n >>> 16; return (n < 64) ? 64 : (n >= MAXIMUM_CAPACITY) ? 
MAXIMUM_CAPACITY : n + 1; } /** * Load the sharedString.xml file and instance word cache * * @return the {@code SharedStrings} * @throws IOException if io error occur */ public SharedStrings load() throws IOException { // Get unique count max = uniqueCount(); LOGGER.debug("Size of SharedString: {}", max); // init(); return this; } /* */ private void init() throws IOException { status = 1; // Unknown size or greater than {@code 8192} if (max < 0 || max > 1 << 14) { if (page <= 0) page = 16; status <<= 2; forward = new String[page]; backward = new String[page]; // Cache 8KB binary, it will store 1^16 strings. tester = new Tester.BinaryTester(max > 0 ? Math.min(max, 1 << 16) : 1 << 16); if (hotSize > 0) hot = FixSizeLRUCache.create(hotSize); else hot = FixSizeLRUCache.create(); // Instance the SharedStringTable if (sst == null) { sst = new IndexSharedStringTable(); sst.setShortSectorSize(numberOfTrailingZeros(page)); } } else if (max > 512) { status <<= 1; page = (max + 1) >> 1; forward = new String[page]; backward = new String[page]; } else { forward = new String[page = Math.max(16, max)]; } } /** * Getting the unique strings count in SharedStringTable * * @return the unique strings count * @throws IOException if I/O error occur */ private int uniqueCount() throws IOException { int off = -1; cb = new char[1 << 12]; length = reader.read(cb); // Empty Shared String Table if (length <= 0) return status = 0; String line = new String(cb, 0, Math.min(256, length)); // Microsoft Excel String uniqueCount = " uniqueCount="; int index = line.indexOf(uniqueCount) , end = index > 0 ? line.indexOf('"', index += (uniqueCount.length() + 1)) : -1; if (end > 0) { off = Integer.parseInt(line.substring(index, end)); } // WPS else { String count = " count="; index = line.indexOf(count); end = index > 0 ? 
line.indexOf('"', index += (count.length() + 1)) : -1; if (end > 0) { off = Integer.parseInt(line.substring(index, end)); } } if (end > 0) nChar = end + 1; return off; } /** * Getting the strings value by index * * @param index the index of SharedStringTable * @return string */ public String get(int index) { // checkBound(index); total++; // Load first if (offset_forward == -1) { offset_forward = index / page * page; readMore(); } String value = null; // Find in forward if (forwardRange(index)) { value = forward[index - offset_forward]; total_forward++; if (test(index)) hot.put(index, value); return value; } if (status == 1) throw new IndexOutOfBoundsException("Index: " + index + ", Size: " + max); // Find in backward if (backwardRange(index)) { value = backward[index - offset_backward]; total_backward++; if (test(index)) hot.put(index, value); return value; } // Find in hot cache if (status == 4) { value = hot.get(index); } // Can't find in memory cache if (value == null) { if (status == 2 && offset_backward > -1) throw new IndexOutOfBoundsException("Index: " + index + ", Size: " + max); copyToBackward(); // reload data offset_forward = index / page * page; forward[0] = null; if (status == 4 && index < sst.size()) { try { // Load from SharedStringTable limit_forward = sst.get(offset_forward, forward); } catch (IOException e) { throw new ExcelReadException(e); } total_sst++; } else { readMore(); total_forward++; } if (forward[0] == null) { throw new IndexOutOfBoundsException("Index: " + index + ", Size: " + max); } value = forward[index - offset_forward]; if (test(index)) hot.put(index, value); } else { total_hot++; } return value; } // Check the forward range private boolean forwardRange(int index) { return offset_forward >= 0 && offset_forward <= index && offset_forward + limit_forward > index; } // Check the backward range private boolean backwardRange(int index) { return offset_backward >= 0 && offset_backward <= index && offset_backward + limit_backward > 
index; } // // Check the current index if out of bound // private void checkBound(int index) { // if (index < 0 || max > -1 && max <= index) { // throw new IndexOutOfBoundsException("Index: " + index + ", Size: " + max); // } // } // Check the current index has been loaded twice private boolean test(int index) { return status == 4 && tester.test(index); } private void copyToBackward() { String[] tmp = backward; backward = forward; forward = tmp; offset_backward = offset_forward; limit_backward = limit_forward; } /** * Load string record from xml */ protected void readMore() { int index = offset_forward / page; try { // Read xml file string value into IndexSharedStringTable for (int n = index - offsetM; n-- >= 0; ) { if (offset_backward == -1 && limit_forward > 0) { copyToBackward(); offset_backward = 0; } readData(); } } catch (IOException e) { throw new ExcelReadException(e); } } /** * Read data from main reader * forward only * * @return the word count * @throws IOException if I/O error occur */ protected int readData() throws IOException { // Read forward area data int n = 0, len = length, offset; for (; ;) { int len0 = len - 3, len1 = len0 - 1; int[] t = findT(cb, nChar, len, len0, len1, n); nChar = t[0]; limit_forward = n = t[1]; // A page Or EOF if (n == page || len < cb.length && nChar == len - 6) { ++offsetM; // out of index range break; } // If cell value(character value) length greater than buffer size if (nChar == 0) { cb = Arrays.copyOf(cb, cb.length << 1); offset = len; } else if (nChar < len) System.arraycopy(cb, nChar, cb, 0, offset = len - nChar); else offset = 0; // Read more if ((len = reader.read(cb, offset, cb.length - offset)) <= 0) break; len += offset; nChar = 0; } // Reset totals when unknown size if (max < n) { max = offsetM * page + n; } return n; // Returns the word count } // [0]: nChar // [1]: number of string private int[] findT(char[] cb, int nChar, int length, int len0, int len1, int n) throws IOException { int cursor; for (; nChar < 
length && n < page; ) { cursor = nChar; // find the tag `` or tag `` for (; nChar < len0 && cb[nChar] != '<'; ++nChar) ; // Empty if (nChar < len0 && cb[nChar + 1] == 's' && cb[nChar + 2] == 'i' && (cb[nChar + 3] == '>' || cb[nChar + 3] == '/' && cb[nChar + 4] == '>')) { if (cb[nChar + 3] == '/') { forward[n++] = EMPTY; if (status == 4) sst.push(forward[n - 1]); nChar += 5; continue; } else nChar += 4; } int[] subT = subT(cb, nChar, len0, len1); int a = subT[0]; if (a == -1) break; nChar = subT[1]; String tmp = escape(cb, a, nChar); // Skip the end tag of 't' nChar += 4; // Test the next tag if (cb[nChar] != '<') for (; nChar < len1 && (cb[nChar] != '<'); ++nChar); // End of if (nChar < len1 && cb[nChar + 1] == '/' && cb[nChar + 2] == 's' && cb[nChar + 3] == 'i' && cb[nChar + 4] == '>') { forward[n++] = tmp; if (status == 4) sst.push(forward[n - 1]); nChar += 5; } else { int t = nChar; // Find the end tag of 'si' for (; nChar < len1 && (cb[nChar] != '<' || cb[nChar + 1] != '/' || cb[nChar + 2] != 's' || cb[nChar + 3] != 'i' || cb[nChar + 4] != '>'); ++nChar); if (nChar >= len1) { nChar = cursor; break; } int end = nChar; nChar = t; boolean shouldClear = true; // Loop and join for (; ; ) { subT = subT(cb, nChar, end, end - 1); a = subT[0]; if (a == -1) break; nChar = subT[1]; if (buf == null) { shouldClear = false; buf = new StringBuilder(tmp); } else if (shouldClear) { shouldClear = false; buf.delete(0, buf.length()); buf.append(tmp); } buf.append(escape(cb, a, nChar)); nChar += 4; } forward[n++] = shouldClear ? 
tmp : buf.toString(); if (status == 4) sst.push(forward[n - 1]); nChar = end + 5; } // An integral page records if (n == page) break; } // DEBUG the last character // LOGGER.info("---------{}---------", new String(cb, nChar, length - nChar)); return new int[] { nChar, n }; } // Returns the index round of private int[] subT(char[] cb, int nChar, int len0, int len1) { do { // The next tag for (; nChar < len0 && cb[nChar] != '<'; ++nChar) ; if (nChar >= len1) return new int[] { -1 }; // Ignore translate if (cb[nChar + 1] == 'r' && cb[nChar + 2] == 'P' && cb[nChar + 3] == 'h' && (cb[nChar + 4] == '>' || cb[nChar + 4] == ' ')) { int a = nChar + 5; for (int len = len1 - 2; a < len && cb[a] != '<' || cb[a + 1] != '/' || cb[a + 2] != 'r' || cb[a + 3] != 'P' || cb[a + 4] != 'h' || cb[a + 5] != '>'; ++a) ; if (a >= len1 - 2) return new int[] { -1 }; nChar = a + 6; } else break; } while (nChar < len1); // Empty si if (nChar < len1 && cb[nChar + 1] == '/' && cb[nChar + 2] == 's' && cb[nChar + 3] == 'i' && cb[nChar + 4] == '>') { // It will skip the tag, so here you need to go back 4 characters in reverse return new int[] { nChar - 4, nChar - 4 }; } for (; nChar < len0 && (cb[nChar] != '<' || cb[nChar + 1] != 't' || cb[nChar + 2] != '>' && cb[nChar + 2] != ' ' && cb[nChar + 2] != '/'); ++nChar) ; if (nChar >= len0) return new int[] { -1 }; // Not found // Empty tag if (cb[nChar + 2] == '/' && cb[nChar + 3] == '>') return new int[] { nChar, nChar }; int a = nChar += 3; if (cb[nChar - 1] == ' ') { // space="preserve" for (; nChar < len0 && cb[nChar++] != '>'; ) ; if (nChar >= len0) return new int[] { -1 }; // Not found a = nChar; } for (; nChar < len1 && (cb[nChar] != '<' || cb[nChar + 1] != '/' || cb[nChar + 2] != 't' || cb[nChar + 3] != '>'); ++nChar) ; if (nChar >= len1) return new int[] { -1 }; // Not found return new int[] { a, nChar }; } // Buffer cache (Maximum 64K) private static char[] charBuffer = {}; /** * escape * * @param cb source char buffer * @param from starting 
position in the source array. * @param to ending position in the source array. * @return Escape xml string */ public static String escape(char[] cb, int from, int to) { int n = to - from; if (n == 0) return EMPTY; int idx_38 = indexOf(cb, '&', from, to) , idx_59 = idx_38 > -1 && idx_38 < to ? indexOf(cb, ';', idx_38 + 1, Math.min(idx_38 + 9, to)) : -1; if (idx_38 < from || idx_38 >= idx_59 || idx_59 > to) return new String(cb, from, to - from); char[] buf; if (n <= charBuffer.length) buf = charBuffer; else if (n <= MAX_CHARACTERS_PER_CELL) charBuffer = buf = new char[Math.min(n + 100, MAX_CHARACTERS_PER_CELL)]; else buf = new char[n]; int offset = 0; do { System.arraycopy(cb, from, buf, offset, n = idx_38 - from); offset += n; // ASCII if (cb[idx_38 + 1] == '#') { char c = cb[idx_38 + 2]; if (c == 'x') offset += toChars(toIntH(cb, idx_38 + 3, idx_59), buf, offset); else if (c >= '0' && c <= '9') offset += toChars(toInt(cb, idx_38 + 2, idx_59), buf, offset); else { System.arraycopy(cb, idx_38, buf, offset, n = idx_59 - idx_38 + 1); offset += n; } } // desc else { n = idx_59 - idx_38 - 1; if (n == 2 && cb[idx_38 + 1] == 'l' && cb[idx_38 + 2] == 't') buf[offset++] = '<'; else if (n == 2 && cb[idx_38 + 1] == 'g' && cb[idx_38 + 2] == 't') buf[offset++] = '>'; else if (n == 3 && cb[idx_38 + 1] == 'a' && cb[idx_38 + 2] == 'm' && cb[idx_38 + 3] == 'p') buf[offset++] = '&'; else if (n == 4 && cb[idx_38 + 1] == 'n' && cb[idx_38 + 2] == 'b' && cb[idx_38 + 3] == 's' && cb[idx_38 + 4] == 'p') buf[offset++] = ' '; else if (n == 4 && cb[idx_38 + 1] == 'q' && cb[idx_38 + 2] == 'u' && cb[idx_38 + 3] == 'o' && cb[idx_38 + 4] == 't') buf[offset++] = '"'; else { System.arraycopy(cb, idx_38, buf, offset, n = idx_59 - idx_38 + 1); offset += n; } } from = ++idx_59; idx_59 = (idx_38 = indexOf(cb, '&', idx_59, to)) > -1 && idx_38 < to ? 
indexOf(cb, ';', idx_38 + 1, Math.min(idx_38 + 9, to)) : -1; } while (idx_38 > -1 && idx_59 > idx_38 && idx_59 <= to); if (from < to) { System.arraycopy(cb, from, buf, offset, n = to - from); offset += n; } return new String(buf, 0, offset); } private static int indexOf(char[] cb, char c, int from, int to) { for (; from < to && cb[from] != c; from++); return from < to ? from : -1; } static int toInt(char[] cb, int a, int b) { int n = 0; boolean negative = cb[a] == '-'; for (int i = negative ? a + 1 : a; b > i; n = n * 10 + cb[i++] - '0'); return negative ? -n : n; } // Hex value static int toIntH(char[] cb, int a, int b) { int n = 0; for (int c; b > a; n = n * 16 + ((c = cb[a++]) <= '9' ? c - '0' : (c >= 'a' ? c - 32 : c) - '7')); return n; } static int toChars(int codePoint, char[] dst, int i) { int n; if (isBmpCodePoint(codePoint)) { dst[i] = (char) codePoint; n = 1; } else if (isValidCodePoint(codePoint)) { dst[i + 1] = lowSurrogate(codePoint); dst[i] = highSurrogate(codePoint); n = 2; } else { dst[i] = MALFORMED_CHAR; // Illegal value � n = 1; } return n; } /** * close stream and free space */ @Override public void close() throws IOException { if (reader != null) { // Debug hit rate LOGGER.debug("Count: {}, uniqueCount: {}, Repetition rate: {}%", total, max , (total > 0 ? (total - max) * 100.0 / total : 0)); LOGGER.debug("Forward: {}, Backward: {}, SST: {}, Hot: {}, Tester: {Resize: {}, Size: {}}" , total_forward, total_backward, total_sst, total_hot , tester != null ? tester.analysis() : 0, tester != null ? tester.size() : 0); reader.close(); } cb = null; forward = null; backward = null; if (tester != null) { tester = null; } if (sst != null) { sst.close(); } } @Override public String toString() { return "Count: " + (total <= 0 ? 
max : total) + ",UniqueCount: " + max; } } interface Tester { /** * Test if a string needs to be cached * * @param i the string index in {@link IndexSharedStringTable} * @return true if the string should be cached */ boolean test(int i); /** * Returns the limit index of {@link Tester} * * @return limit index */ int limit(); /** * Returns the block size of {@link Tester} * * @return the mark array length */ int size(); int analysis(); class BinaryTester implements Tester { private int start; private int limit; private final int initial_size; private final long[] marks; // private static final int LIMIT = (1 << 25) - 1; private int total_resize; // For debug BinaryTester(int expectedInsertions) { marks = new long[initial_size = ((expectedInsertions - 1) >> 6) + 1]; limit = (initial_size << 6) - 1; } @Override public boolean test(int i) { if (i < start) return true; // Check bound of bit-set if (i > limit && !resize(i)) return false; i = i - start; int n = i >> 6, m = i - (n << 6); boolean a = ((marks[n] >> (63 - m)) & 1) == 1; marks[n] |= 1L << (63 - m); return a; } @Override public int limit() { return limit; } @Override public int size() { return marks.length; } @Override public int analysis() { return total_resize; } private boolean resize(int i) { total_resize++; int ii = 0, n = marks.length, l = ((i - start) >> 6) + 1; for (; ii < n && marks[ii] == -1; ii++) ; if (l - ii > initial_size) for (; ii < n && (Long.bitCount(marks[ii]) > 48 || marks[ii] == 0); ii++) ; if (l - ii <= initial_size) { // Clean old mark int j = n - ii; if (j > 0) System.arraycopy(marks, ii, marks, 0, j); for (; j < n; marks[j++] = 0L) ; start += (ii << 6); limit = (marks.length << 6) + start - 1; return true; } return false; } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy