All Downloads are FREE. The search and download functionality uses the official Maven repository.

org.apache.solr.search.stats.StatsUtil Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.search.stats;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.lang.invoke.MethodHandles;

import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.lucene.index.Term;
import org.apache.solr.common.util.Utils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Various utilities for de/serialization of term stats and collection stats.
 * 

TODO: serialization format is very simple and does nothing to compress the data.

*/ public class StatsUtil { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); public static final String ENTRY_SEPARATOR = "!"; public static final char ENTRY_SEPARATOR_CHAR = '!'; /** * Parse a list of urls separated by "|" in order to retrieve a shard name. * @param collectionName collection name * @param shardUrls list of urls * @return shard name, or shardUrl if no shard info is present, * or null if impossible to determine (eg. empty string) */ public static String shardUrlToShard(String collectionName, String shardUrls) { // we may get multiple replica urls String[] urls = shardUrls.split("\\|"); if (urls.length == 0) { return null; } String[] urlParts = urls[0].split("/"); String coreName = urlParts[urlParts.length - 1]; String replicaName = Utils.parseMetricsReplicaName(collectionName, coreName); String shard; if (replicaName != null) { shard = coreName.substring(collectionName.length() + 1); shard = shard.substring(0, shard.length() - replicaName.length() - 1); } else { if (coreName.length() > collectionName.length() && coreName.startsWith(collectionName)) { shard = coreName.substring(collectionName.length() + 1); if (shard.isEmpty()) { shard = urls[0]; } } else { shard = urls[0]; } } return shard; } public static String termsToEncodedString(Collection terms) { StringBuilder sb = new StringBuilder(); for (Object o : terms) { if (sb.length() > 0) { sb.append(ENTRY_SEPARATOR); } if (o instanceof Term) { sb.append(termToEncodedString((Term) o)); } else { sb.append(termToEncodedString(String.valueOf(o))); } } return sb.toString(); } public static Set termsFromEncodedString(String data) { Set terms = new HashSet<>(); if (data == null || data.trim().isEmpty()) { return terms; } String[] items = data.split(ENTRY_SEPARATOR); for (String item : items) { Term t = termFromEncodedString(item); if (t != null) { terms.add(t); } } return terms; } public static Set fieldsFromString(String data) { Set fields = new HashSet<>(); 
if (data == null || data.trim().isEmpty()) { return fields; } String[] items = data.split(ENTRY_SEPARATOR); for (String item : items) { if (!item.trim().isEmpty()) { fields.add(item); } } return fields; } public static String fieldsToString(Collection fields) { StringBuilder sb = new StringBuilder(); for (String field : fields) { if (field.trim().isEmpty()) { continue; } if (sb.length() > 0) { sb.append(ENTRY_SEPARATOR); } sb.append(field); } return sb.toString(); } /** * Make a String representation of {@link CollectionStats} */ public static String colStatsToString(CollectionStats colStats) { StringBuilder sb = new StringBuilder(); sb.append(colStats.field); sb.append(','); sb.append(colStats.maxDoc); sb.append(','); sb.append(colStats.docCount); sb.append(','); sb.append(colStats.sumTotalTermFreq); sb.append(','); sb.append(colStats.sumDocFreq); return sb.toString(); } private static CollectionStats colStatsFromString(String data) { if (data == null || data.trim().length() == 0) { log.warn("Invalid empty collection stats string"); return null; } String[] vals = data.split(","); if (vals.length != 5) { log.warn("Invalid collection stats string, num fields " + vals.length + " != 5, '" + data + "'"); return null; } String field = vals[0]; try { long maxDoc = Long.parseLong(vals[1]); long docCount = Long.parseLong(vals[2]); long sumTotalTermFreq = Long.parseLong(vals[3]); long sumDocFreq = Long.parseLong(vals[4]); return new CollectionStats(field, maxDoc, docCount, sumTotalTermFreq, sumDocFreq); } catch (Exception e) { log.warn("Invalid collection stats string '" + data + "': " + e.toString()); return null; } } public static String termToEncodedString(Term t) { StringBuilder sb = new StringBuilder(); sb.append(t.field()).append(':'); sb.append(encode(t.text())); return sb.toString(); } public static final char ESCAPE = '_'; public static final char ESCAPE_ENTRY_SEPARATOR = '0'; public static String encode(String value) { StringBuilder output = new 
StringBuilder(value.length() + 2); for (int i = 0; i < value.length(); i++) { char c = value.charAt(i); switch (c) { case ESCAPE : output.append(ESCAPE).append(ESCAPE); break; case ENTRY_SEPARATOR_CHAR : output.append(ESCAPE).append(ESCAPE_ENTRY_SEPARATOR); break; default : output.append(c); } } try { return URLEncoder.encode(output.toString(), "UTF-8"); } catch (UnsupportedEncodingException e) { throw new RuntimeException("Apparently your JVM doesn't support UTF-8 encoding?", e); } } public static String decode(String value) throws IOException { value = URLDecoder.decode(value, "UTF-8"); StringBuilder output = new StringBuilder(value.length()); for (int i = 0; i < value.length(); i++) { char c = value.charAt(i); // escaped char follows if (c == ESCAPE && i < value.length() - 1) { i++; char next = value.charAt(i); if (next == ESCAPE) { output.append(ESCAPE); } else if (next == ESCAPE_ENTRY_SEPARATOR) { output.append(ENTRY_SEPARATOR_CHAR); } else { throw new IOException("invalid escape sequence in " + value); } } else { output.append(c); } } return output.toString(); } public static String termToEncodedString(String term) { int idx = term.indexOf(':'); if (idx == -1) { log.warn("Invalid term data without ':': '" + term + "'"); return null; } String prefix = term.substring(0, idx + 1); String value = term.substring(idx + 1); return prefix + encode(value); } public static Term termFromEncodedString(String data) { if (data == null || data.trim().length() == 0) { log.warn("Invalid empty term value"); return null; } int idx = data.indexOf(':'); if (idx == -1) { log.warn("Invalid term data without ':': '" + data + "'"); return null; } String field = data.substring(0, idx); String value = data.substring(idx + 1); try { return new Term(field, decode(value)); } catch (Exception e) { log.warn("Invalid term value '" + value + "'"); return null; } } public static String termStatsToString(TermStats termStats, boolean encode) { StringBuilder sb = new StringBuilder(); 
sb.append(encode ? termToEncodedString(termStats.term) : termStats.term).append(','); sb.append(termStats.docFreq); sb.append(','); sb.append(termStats.totalTermFreq); return sb.toString(); } private static TermStats termStatsFromString(String data) { if (data == null || data.trim().length() == 0) { log.warn("Invalid empty term stats string"); return null; } String[] vals = data.split(","); if (vals.length < 3) { log.warn("Invalid term stats string, num fields " + vals.length + " < 3, '" + data + "'"); return null; } Term term = termFromEncodedString(vals[0]); try { long docFreq = Long.parseLong(vals[1]); long totalTermFreq = Long.parseLong(vals[2]); return new TermStats(term.toString(), docFreq, totalTermFreq); } catch (Exception e) { log.warn("Invalid termStats string '" + data + "'"); return null; } } public static Map colStatsMapFromString(String data) { if (data == null || data.trim().length() == 0) { return null; } Map map = new HashMap(); String[] entries = data.split(ENTRY_SEPARATOR); for (String es : entries) { CollectionStats stats = colStatsFromString(es); if (stats != null) { map.put(stats.field, stats); } } return map; } public static String colStatsMapToString(Map stats) { if (stats == null || stats.isEmpty()) { return ""; } StringBuilder sb = new StringBuilder(); for (Entry e : stats.entrySet()) { if (sb.length() > 0) { sb.append(ENTRY_SEPARATOR); } sb.append(colStatsToString(e.getValue())); } return sb.toString(); } public static Map termStatsMapFromString(String data) { if (data == null || data.trim().length() == 0) { return null; } Map map = new HashMap<>(); String[] entries = data.split(ENTRY_SEPARATOR); for (String es : entries) { TermStats termStats = termStatsFromString(es); if (termStats != null) { map.put(termStats.term, termStats); } } return map; } public static String termStatsMapToString(Map stats) { if (stats == null || stats.isEmpty()) { return ""; } StringBuilder sb = new StringBuilder(); for (Entry e : stats.entrySet()) { if 
(sb.length() > 0) { sb.append(ENTRY_SEPARATOR); } sb.append(termStatsToString(e.getValue(), true)); } return sb.toString(); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy