All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.solr.handler.designer.DefaultSampleDocumentsLoader Maven / Gradle / Ivy

There is a newer version: 9.7.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.handler.designer;

import static org.apache.solr.common.params.CommonParams.JSON_MIME;
import static org.apache.solr.handler.loader.CSVLoaderBase.SEPARATOR;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.lang.invoke.MethodHandles;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.ContentStreamBase;
import org.apache.solr.handler.loader.CSVLoaderBase;
import org.apache.solr.handler.loader.JsonLoader;
import org.apache.solr.handler.loader.XMLLoader;
import org.apache.solr.request.SolrQueryRequestBase;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.update.processor.UpdateRequestProcessor;
import org.apache.solr.util.SafeXMLParsing;
import org.noggit.JSONParser;
import org.noggit.ObjectBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.xml.sax.SAXException;

public class DefaultSampleDocumentsLoader implements SampleDocumentsLoader {
  public static final String CSV_MULTI_VALUE_DELIM_PARAM = "csvMultiValueDelimiter";
  private static final int MAX_STREAM_SIZE = (5 * 1024 * 1024);
  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  /**
   * Drains the given stream into memory and closes it.
   *
   * @param in the stream to consume until end-of-stream; it is closed before this method returns,
   *     whether or not reading succeeds
   * @return all bytes read from {@code in}, in the order they were read
   * @throws IOException if reading from or closing the stream fails
   */
  public static byte[] streamAsBytes(final InputStream in) throws IOException {
    try (in) {
      return in.readAllBytes();
    }
  }

  /**
   * Buffers the uploaded sample in memory and parses it into Solr input documents, choosing the
   * parser from the stream's content type.
   *
   * <p>Content types containing {@code JSON_MIME} are parsed as JSON, {@code text/xml} and {@code
   * application/xml} as XML, {@code text/csv} and {@code application/csv} as CSV, and {@code
   * text/plain} and {@code application/octet-stream} as JSON lines.
   *
   * @param params request parameters handed to the format-specific loaders; {@code null} is
   *     treated as an empty parameter set
   * @param stream the uploaded sample; a stream without a content type yields {@link
   *     SampleDocuments#NONE}
   * @param maxDocsToLoad upper bound on the number of documents returned; values less than or
   *     equal to zero disable the limit
   * @return the parsed documents together with the content type and a label for the source
   * @throws IOException if the stream cannot be read or a loader fails
   * @throws SolrException with {@code BAD_REQUEST} when the sample exceeds the maximum upload size
   *     or the content type is not supported
   */
  @Override
  public SampleDocuments parseDocsFromStream(
      SolrParams params, ContentStream stream, final int maxDocsToLoad) throws IOException {
    final String contentType = stream.getContentType();
    if (contentType == null) {
      return SampleDocuments.NONE;
    }

    if (params == null) {
      params = new ModifiableSolrParams();
    }

    final String tooBigMessage =
        "Sample is too big! "
            + MAX_STREAM_SIZE
            + " bytes is the max upload size for sample documents.";

    // fail fast, before buffering anything, when the stream advertises its size
    Long streamSize = stream.getSize();
    if (streamSize != null && streamSize > MAX_STREAM_SIZE) {
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, tooBigMessage);
    }

    String fileSource = "paste";
    if ("file".equals(stream.getName())) {
      fileSource = stream.getSourceInfo() != null ? stream.getSourceInfo() : "file";
    }

    byte[] uploadedBytes = streamAsBytes(stream.getStream());
    // recheck the upload size in case the stream returned null for getSize
    if (uploadedBytes.length > MAX_STREAM_SIZE) {
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, tooBigMessage);
    }
    // use a byte stream for the parsers in case they need to re-parse using a different strategy
    // e.g. JSON vs. JSON lines or different CSV strategies ...
    ContentStreamBase.ByteArrayStream byteStream =
        new ContentStreamBase.ByteArrayStream(uploadedBytes, fileSource, contentType);
    String charset = ContentStreamBase.getCharsetFromContentType(contentType);
    if (charset == null) {
      charset = ContentStreamBase.DEFAULT_CHARSET;
    }

    List<SolrInputDocument> docs = null;
    // Decide on the bytes actually read: getSize() may be null (see the recheck above), and
    // unboxing it here would throw a NullPointerException for streams of unknown size.
    if (uploadedBytes.length > 0) {
      if (contentType.contains(JSON_MIME)) {
        docs = loadJsonDocs(params, byteStream, maxDocsToLoad);
      } else if (contentType.contains("text/xml") || contentType.contains("application/xml")) {
        docs = loadXmlDocs(params, byteStream, maxDocsToLoad);
      } else if (contentType.contains("text/csv") || contentType.contains("application/csv")) {
        docs = loadCsvDocs(params, fileSource, uploadedBytes, charset, maxDocsToLoad);
      } else if (contentType.contains("text/plain")
          || contentType.contains("application/octet-stream")) {
        docs = loadJsonLines(params, byteStream, maxDocsToLoad);
      } else {
        throw new SolrException(
            SolrException.ErrorCode.BAD_REQUEST, contentType + " not supported yet!");
      }

      // the loaders already try to honor the limit; this is the final safety net
      if (docs != null && maxDocsToLoad > 0 && docs.size() > maxDocsToLoad) {
        docs = docs.subList(0, maxDocsToLoad);
      }
    }

    return new SampleDocuments(docs, contentType, fileSource);
  }

  /**
   * Parses CSV (or TSV) sample data into Solr input documents.
   *
   * <p>When the request does not specify a separator, the bytes are decoded with {@code charset},
   * the separator is guessed from the first line via {@link #detectTSV(String)} and added to a
   * copy of the parameters. When a separator is given, the raw bytes are parsed unchanged.
   *
   * @param params request parameters; consulted for the CSV separator and passed to the loader
   * @param source label attached to the byte stream when the raw bytes are parsed directly
   * @param streamBytes the complete uploaded CSV payload
   * @param charset name of the charset used to decode {@code streamBytes} for separator detection
   * @param maxDocsToLoad upper bound on the number of documents collected; values less than or
   *     equal to zero disable the limit
   * @return the documents produced by the CSV loader
   * @throws IOException if the charset is unsupported or the CSV loader fails
   */
  protected List<SolrInputDocument> loadCsvDocs(
      SolrParams params, String source, byte[] streamBytes, String charset, final int maxDocsToLoad)
      throws IOException {
    ContentStream stream;
    if (params.get(SEPARATOR) == null) {
      // no explicit separator: sniff comma vs. tab from the decoded text
      String csvStr = new String(streamBytes, charset);
      char sep = detectTSV(csvStr);
      ModifiableSolrParams modifiableSolrParams = new ModifiableSolrParams(params);
      modifiableSolrParams.set(SEPARATOR, String.valueOf(sep));
      params = modifiableSolrParams;
      // reuse the already-decoded text instead of decoding the bytes a second time
      stream = new ContentStreamBase.StringStream(csvStr, "text/csv");
    } else {
      stream = new ContentStreamBase.ByteArrayStream(streamBytes, source, "text/csv");
    }
    SampleCSVLoader csvLoader = new SampleCSVLoader(new CSVRequest(params), maxDocsToLoad);
    return csvLoader.loadDocs(stream);
  }

  /**
   * Parses a JSON-lines payload (one JSON object per line) into Solr input documents.
   *
   * <p>Each line is trimmed; only lines that start with <code>{</code> and end with <code>}</code>
   * are parsed, and only those that parse to a JSON object are kept. Reading stops once {@code
   * maxDocsToLoad} objects have been collected.
   *
   * @param params request parameters (not used by this implementation)
   * @param stream the buffered upload to read line by line
   * @param maxDocsToLoad upper bound on the number of documents read; values less than or equal
   *     to zero disable the limit
   * @return one document per accepted line, in input order
   * @throws IOException if reading the stream or parsing a line fails
   */
  @SuppressWarnings("unchecked")
  protected List<SolrInputDocument> loadJsonLines(
      SolrParams params, ContentStreamBase.ByteArrayStream stream, final int maxDocsToLoad)
      throws IOException {
    List<Map<String, Object>> docs = new ArrayList<>();
    try (BufferedReader br = new BufferedReader(stream.getReader())) {
      String line;
      while ((line = br.readLine()) != null) {
        line = line.trim();
        // an empty line can never start with '{', so no separate emptiness check is needed
        if (line.startsWith("{") && line.endsWith("}")) {
          Object jsonLine = ObjectBuilder.getVal(new JSONParser(line));
          if (jsonLine instanceof Map) {
            docs.add((Map<String, Object>) jsonLine);
          }
        }
        if (maxDocsToLoad > 0 && docs.size() >= maxDocsToLoad) {
          break;
        }
      }
    }

    return docs.stream().map(JsonLoader::buildDoc).collect(Collectors.toList());
  }

  @SuppressWarnings("unchecked")
  protected List loadJsonDocs(
      SolrParams params, ContentStreamBase.ByteArrayStream stream, final int maxDocsToLoad)
      throws IOException {
    Object json;
    try (Reader r = stream.getReader()) {
      json = ObjectBuilder.getVal(new JSONParser(r));
    }
    if (json == null) {
      throw new SolrException(
          SolrException.ErrorCode.BAD_REQUEST, "Expected at least 1 JSON doc in the request body!");
    }

    List> docs;
    if (json instanceof List) {
      // list of docs
      docs = (List>) json;
    } else if (json instanceof Map) {
      // single doc ... see if this is a json lines file
      boolean isJsonLines = false;
      String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
      String jsonStr =
          new String(
              streamAsBytes(stream.getStream()),
              charset != null ? charset : ContentStreamBase.DEFAULT_CHARSET);
      String[] lines = jsonStr.split("\n");
      if (lines.length > 1) {
        for (String line : lines) {
          line = line.trim();
          if (!line.isEmpty() && line.startsWith("{") && line.endsWith("}")) {
            isJsonLines = true;
            break;
          }
        }
      }
      if (isJsonLines) {
        docs = loadJsonLines(lines);
      } else {
        docs = Collections.singletonList((Map) json);
      }
    } else {
      throw new SolrException(
          SolrException.ErrorCode.BAD_REQUEST,
          "Expected one or more JSON docs in the request body!");
    }
    if (maxDocsToLoad > 0 && docs.size() > maxDocsToLoad) {
      docs = docs.subList(0, maxDocsToLoad);
    }
    return docs.stream().map(JsonLoader::buildDoc).collect(Collectors.toList());
  }

  /**
   * Parses an XML payload into Solr input documents.
   *
   * <p>Only Solr's XML update format is supported: the payload must contain both an {@code <add>}
   * and a {@code <doc>} element. Any other well-formed XML is rejected with an error naming its
   * root element.
   *
   * <p>The payload is always decoded as UTF-8 (see {@link #readInputAsString(InputStream)}).
   *
   * @param params request parameters (not used by this implementation)
   * @param stream the buffered upload
   * @param maxDocsToLoad upper bound on the number of documents read; values less than or equal
   *     to zero disable the limit
   * @return the documents found in the payload, in input order
   * @throws IOException if the payload cannot be read, is not well-formed, or is not in the
   *     supported format
   */
  protected List<SolrInputDocument> loadXmlDocs(
      SolrParams params, ContentStreamBase.ByteArrayStream stream, final int maxDocsToLoad)
      throws IOException {
    String xmlString = readInputAsString(stream.getStream()).trim();
    if (xmlString.contains("<add>") && xmlString.contains("<doc>")) {
      XMLInputFactory inputFactory = XMLInputFactory.newInstance();
      // the sample is untrusted input: never resolve external entities or process DTDs
      inputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false);
      inputFactory.setProperty(XMLInputFactory.SUPPORT_DTD, false);
      XMLStreamReader parser = null;
      try {
        parser = inputFactory.createXMLStreamReader(new StringReader(xmlString));
        return parseXmlDocs(parser, maxDocsToLoad);
      } catch (XMLStreamException e) {
        throw new IOException(e);
      } finally {
        if (parser != null) {
          try {
            parser.close();
          } catch (XMLStreamException ignored) {
            // best-effort cleanup; a failure to close must not mask the parse result or error
          }
        }
      }
    }

    Document xmlDoc;
    try {
      xmlDoc = SafeXMLParsing.parseUntrustedXML(log, xmlString);
    } catch (SAXException e) {
      throw new IOException(e);
    }
    Element root = xmlDoc.getDocumentElement();
    // TODO: support other types of XML here
    throw new IOException(
        "TODO: XML documents with root " + root.getTagName() + " not supported yet!");
  }

  /**
   * Walks the XML event stream and converts every {@code doc} element into a Solr input document.
   *
   * <p>The parser is closed when the end of the document is reached or when {@code maxDocsToLoad}
   * documents have been collected.
   *
   * @param parser the XML event reader, positioned before the first element of interest
   * @param maxDocsToLoad upper bound on the number of documents read; values less than or equal
   *     to zero disable the limit
   * @return the documents read so far, in input order
   * @throws XMLStreamException if the underlying XML is malformed
   */
  protected List<SolrInputDocument> parseXmlDocs(XMLStreamReader parser, final int maxDocsToLoad)
      throws XMLStreamException {
    List<SolrInputDocument> docs = new ArrayList<>();
    XMLLoader loader = new XMLLoader().init(null);
    while (true) {
      final int event;
      try {
        event = parser.next();
      } catch (java.util.NoSuchElementException endOfInput) {
        // the reader has no more events; hand back whatever was collected
        return docs;
      }
      switch (event) {
        case XMLStreamConstants.END_DOCUMENT:
          parser.close();
          return docs;
        case XMLStreamConstants.START_ELEMENT:
          if ("doc".equals(parser.getLocalName())) {
            SolrInputDocument doc = loader.readDoc(parser);
            if (doc != null) {
              docs.add(doc);

              if (maxDocsToLoad > 0 && docs.size() >= maxDocsToLoad) {
                parser.close();
                return docs;
              }
            }
          }
          break;
        default:
          // all other events are skipped
          break;
      }
    }
  }

  @SuppressWarnings("unchecked")
  protected List> loadJsonLines(String[] lines) throws IOException {
    List> docs = new ArrayList<>(lines.length);
    for (String line : lines) {
      line = line.trim();
      if (!line.isEmpty() && line.startsWith("{") && line.endsWith("}")) {
        Object jsonLine = ObjectBuilder.getVal(new JSONParser(line));
        if (jsonLine instanceof Map) {
          docs.add((Map) jsonLine);
        }
      }
    }
    return docs;
  }

  /**
   * Reads the whole stream and decodes it as UTF-8 text.
   *
   * @param in the stream to consume; it is handed to {@link #streamAsBytes(InputStream)}, which
   *     closes it
   * @return the stream's content decoded as UTF-8
   * @throws IOException if reading the stream fails
   */
  protected String readInputAsString(InputStream in) throws IOException {
    final byte[] raw = streamAsBytes(in);
    return new String(raw, StandardCharsets.UTF_8);
  }

  /**
   * Guesses the column separator of delimited text by counting commas and tabs on its first line.
   *
   * <p>Only the text before the first {@code '\n'} is inspected. Without any newline the answer is
   * always a comma. With a newline, a tab wins whenever there are at least as many tabs as commas,
   * which includes a first line containing neither.
   *
   * @param csvStr the complete delimited text
   * @return {@code '\t'} or {@code ','}
   */
  protected char detectTSV(String csvStr) {
    final int newlineAt = csvStr.indexOf('\n');
    if (newlineAt == -1) {
      return ',';
    }

    int commaCount = 0;
    int tabCount = 0;
    for (int i = 0; i < newlineAt; i++) {
      final char c = csvStr.charAt(i);
      if (c == '\t') {
        tabCount++;
      } else if (c == ',') {
        commaCount++;
      }
    }
    return tabCount >= commaCount ? '\t' : ',';
  }

  /**
   * Update processor that adds no behavior of its own. It is handed to {@link SampleCSVLoader},
   * which collects documents in memory rather than sending them through a processor chain.
   */
  private static class NoOpUpdateRequestProcessor extends UpdateRequestProcessor {
    NoOpUpdateRequestProcessor() {
      // null: presumably "no next processor in the chain" -- confirm against UpdateRequestProcessor
      super(null);
    }
  }

  /**
   * Minimal request object that only carries the parameters needed by the CSV loader; it is
   * created in {@code loadCsvDocs} for each parse.
   */
  private static class CSVRequest extends SolrQueryRequestBase {
    CSVRequest(SolrParams params) {
      // first argument is null: presumably the request is not bound to a core -- confirm against
      // SolrQueryRequestBase
      super(null, params);
    }
  }

  /**
   * CSV loader that collects the parsed rows in memory instead of indexing them.
   *
   * <p>When the {@code csvMultiValueDelimiter} request parameter is set, single-valued string
   * fields containing the delimiter are split into multiple values.
   */
  private static class SampleCSVLoader extends CSVLoaderBase {
    private final List<SolrInputDocument> docs = new ArrayList<>();
    private final CSVRequest req;
    private final int maxDocsToLoad;
    private final String multiValueDelimiter;

    /**
     * @param req request carrying the CSV parsing parameters
     * @param maxDocsToLoad upper bound on the number of documents collected; values less than or
     *     equal to zero disable the limit
     */
    SampleCSVLoader(CSVRequest req, int maxDocsToLoad) {
      super(req, new NoOpUpdateRequestProcessor());
      this.req = req;
      this.maxDocsToLoad = maxDocsToLoad;
      this.multiValueDelimiter = req.getParams().get(CSV_MULTI_VALUE_DELIM_PARAM);
    }

    /**
     * Parses the given CSV stream and returns the collected documents.
     *
     * @param stream the CSV content to parse
     * @return the documents collected by {@link #addDoc(int, String[])}, in input order
     * @throws IOException if the underlying CSV loader fails
     */
    List<SolrInputDocument> loadDocs(ContentStream stream) throws IOException {
      load(req, new SolrQueryResponse(), stream, processor);
      return docs;
    }

    /**
     * Called for each CSV row; turns it into a document and stores it unless the limit has
     * already been reached.
     */
    @Override
    public void addDoc(int line, String[] vals) throws IOException {
      if (maxDocsToLoad > 0 && docs.size() >= maxDocsToLoad) {
        return; // just a short circuit, probably doesn't help that much
      }

      templateAdd.clear();
      SolrInputDocument doc = new SolrInputDocument();
      doAdd(line, vals, doc, templateAdd);
      if (templateAdd.solrDoc == null) {
        return;
      }

      if (multiValueDelimiter != null) {
        for (SolrInputField field : templateAdd.solrDoc.values()) {
          if (field.getValueCount() != 1) {
            continue;
          }
          Object value = field.getFirstValue();
          if (value instanceof String) {
            // NOTE: String.split interprets the delimiter as a regular expression, so regex
            // metacharacters such as '|' must be escaped by the caller
            String[] splitValue = ((String) value).split(multiValueDelimiter);
            if (splitValue.length > 1) {
              field.setValue(Arrays.asList(splitValue));
            }
          }
        }
      }
      docs.add(templateAdd.solrDoc);
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy