package org.apache.solr.util;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ByteArrayInputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.HashSet;
import java.util.TimeZone;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import java.util.zip.GZIPInputStream;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.ProtocolException;
import java.net.URL;
import java.net.URLEncoder;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

/**
 * A simple utility class for posting raw updates to a Solr server.
 * It has a main method, so it can be run on the command line.
 * View this not as a best-practice code example, but as a standalone
 * example built with the explicit purpose of not having external
 * jar dependencies.
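 * <p>
 * Typical invocations (mirroring the examples printed by usage()) are:
 * <pre>
 *   java -jar post.jar *.xml
 *   java -Ddata=web -jar post.jar http://example.com/
 * </pre>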
 */
public class SimplePostTool {
  private static final String DEFAULT_POST_URL = "http://localhost:8983/solr/update";
  private static final String VERSION_OF_THIS_TOOL = "1.5";

  private static final String DEFAULT_COMMIT = "yes";
  private static final String DEFAULT_OPTIMIZE = "no";
  private static final String DEFAULT_OUT = "no";
  private static final String DEFAULT_AUTO = "no";
  private static final String DEFAULT_RECURSIVE = "0";
  private static final int DEFAULT_WEB_DELAY = 10;
  private static final int MAX_WEB_DEPTH = 10;
  private static final String DEFAULT_CONTENT_TYPE = "application/xml";
  private static final String DEFAULT_FILE_TYPES = "xml,json,csv,pdf,doc,docx,ppt,pptx,xls,xlsx,odt,odp,ods,ott,otp,ots,rtf,htm,html,txt,log"; 

  static final String DATA_MODE_FILES = "files";
  static final String DATA_MODE_ARGS = "args";
  static final String DATA_MODE_STDIN = "stdin";
  static final String DATA_MODE_WEB = "web";
  static final String DEFAULT_DATA_MODE = DATA_MODE_FILES;

  // Input args
  boolean auto = false;
  int recursive = 0;
  int delay = 0;
  String fileTypes;
  URL solrUrl;
  OutputStream out = null;
  String type;
  String mode;
  boolean commit;
  boolean optimize;
  String[] args;

  private int currentDepth;

  static HashMap<String,String> mimeMap;
  GlobFileFilter globFileFilter;
  // Backlog for crawling
  List<LinkedHashSet<URL>> backlog = new ArrayList<LinkedHashSet<URL>>();
  Set<URL> visited = new HashSet<URL>();
  
  static final Set<String> DATA_MODES = new HashSet<String>();
  static final String USAGE_STRING_SHORT =
      "Usage: java [SystemProperties] -jar post.jar [-h|-] [<file|folder|url|arg> [<file|folder|url|arg>...]]";

  // Used in tests to avoid doing actual network traffic
  static boolean mockMode = false;
  static PageFetcher pageFetcher;

  static {
    DATA_MODES.add(DATA_MODE_FILES);
    DATA_MODES.add(DATA_MODE_ARGS);
    DATA_MODES.add(DATA_MODE_STDIN);
    DATA_MODES.add(DATA_MODE_WEB);
    
    mimeMap = new HashMap<String,String>();
    mimeMap.put("xml", "text/xml");
    mimeMap.put("csv", "text/csv");
    mimeMap.put("json", "application/json");
    mimeMap.put("pdf", "application/pdf");
    mimeMap.put("rtf", "text/rtf");
    mimeMap.put("html", "text/html");
    mimeMap.put("htm", "text/html");
    mimeMap.put("doc", "application/msword");
    mimeMap.put("docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
    mimeMap.put("ppt", "application/vnd.ms-powerpoint");
    mimeMap.put("pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation");
    mimeMap.put("xls", "application/vnd.ms-excel");
    mimeMap.put("xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
    mimeMap.put("odt", "application/vnd.oasis.opendocument.text");
    mimeMap.put("ott", "application/vnd.oasis.opendocument.text");
    mimeMap.put("odp", "application/vnd.oasis.opendocument.presentation");
    mimeMap.put("otp", "application/vnd.oasis.opendocument.presentation");
    mimeMap.put("ods", "application/vnd.oasis.opendocument.spreadsheet");
    mimeMap.put("ots", "application/vnd.oasis.opendocument.spreadsheet");
    mimeMap.put("txt", "text/plain");
    mimeMap.put("log", "text/plain");
  }
  
  /**
   * See usage() for valid command line usage
   * @param args the params on the command line
   */
  public static void main(String[] args) {
    info("SimplePostTool version " + VERSION_OF_THIS_TOOL);
    if (0 < args.length && ("-help".equals(args[0]) || "--help".equals(args[0]) || "-h".equals(args[0]))) {
      usage();
    } else {
      final SimplePostTool t = parseArgsAndInit(args);
      t.execute();
    }
  }

  /**
   * After initialization, call execute to start the post job.
   * This method delegates to the correct mode method.
   */
  public void execute() {
    final long startTime = System.currentTimeMillis();
    if (DATA_MODE_FILES.equals(mode) && args.length > 0) {
      doFilesMode();
    } else if(DATA_MODE_ARGS.equals(mode) && args.length > 0) {
      doArgsMode();
    } else if(DATA_MODE_WEB.equals(mode) && args.length > 0) {
      doWebMode();
    } else if(DATA_MODE_STDIN.equals(mode)) {
      doStdinMode();
    } else {
      usageShort();
      return;
    }
    
    if (commit)   commit();
    if (optimize) optimize();
    final long endTime = System.currentTimeMillis();
    displayTiming(endTime - startTime);
  }
  
  /**
   * Pretty prints the number of milliseconds taken to post the content to Solr
   * @param millis the time in milliseconds
   */
  private void displayTiming(long millis) {
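    // Format the elapsed millis as H:mm:ss.SSS in UTC (valid for runs shorter than 24 hours)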
    SimpleDateFormat df = new SimpleDateFormat("H:mm:ss.SSS", Locale.getDefault());
    df.setTimeZone(TimeZone.getTimeZone("UTC"));
    System.out.println("Time spent: "+df.format(new Date(millis)));
  }

  /**
   * Parses incoming arguments and system params and initializes the tool
   * @param args the incoming cmd line args
   * @return an instance of SimplePostTool
   */
  protected static SimplePostTool parseArgsAndInit(String[] args) {
    String urlStr = null;
    try {
      // Parse args
      final String mode = System.getProperty("data", DEFAULT_DATA_MODE);
      if (! DATA_MODES.contains(mode)) {
        fatal("System Property 'data' is not valid for this tool: " + mode);
      }
      String params = System.getProperty("params", "");
      urlStr = System.getProperty("url", DEFAULT_POST_URL);
      urlStr = SimplePostTool.appendParam(urlStr, params);
      URL url = new URL(urlStr);
      boolean auto = isOn(System.getProperty("auto", DEFAULT_AUTO));
      String type = System.getProperty("type");
      // Recursive
      int recursive = 0;
      String r = System.getProperty("recursive", DEFAULT_RECURSIVE);
      try {
        recursive = Integer.parseInt(r);
      } catch(Exception e) {
        if (isOn(r))
          recursive = DATA_MODE_WEB.equals(mode)?1:999;
      }
      // Delay
      int delay = DATA_MODE_WEB.equals(mode) ? DEFAULT_WEB_DELAY : 0;
      try {
        delay = Integer.parseInt(System.getProperty("delay", ""+delay));
      } catch(Exception e) { }
      OutputStream out = isOn(System.getProperty("out", DEFAULT_OUT)) ? System.out : null;
      String fileTypes = System.getProperty("filetypes", DEFAULT_FILE_TYPES);
      boolean commit = isOn(System.getProperty("commit",DEFAULT_COMMIT));
      boolean optimize = isOn(System.getProperty("optimize",DEFAULT_OPTIMIZE));
      
      return new SimplePostTool(mode, url, auto, type, recursive, delay, fileTypes, out, commit, optimize, args);
    } catch (MalformedURLException e) {
      fatal("System Property 'url' is not a valid URL: " + urlStr);
      return null;
    }
  }

  /**
   * Constructor which takes in all mandatory input for the tool to work.
   * Also see usage() for further explanation of the params.
   * @param mode whether to post files, web pages, params or stdin
   * @param url the Solr base URL to post to, should end with /update
   * @param auto if true, we'll guess type and add resourcename/url
   * @param type content-type of the data you are posting
   * @param recursive number of levels for file/web mode, or 0 if one file only
   * @param delay if recursive then delay will be the wait time between posts
   * @param fileTypes a comma separated list of file-name endings to accept for file/web
   * @param out an OutputStream to write output to, e.g. stdout to print to console
   * @param commit if true, will commit at end of posting
   * @param optimize if true, will optimize at end of posting
   * @param args a String[] of arguments, varies between modes
   */
  public SimplePostTool(String mode, URL url, boolean auto, String type,
      int recursive, int delay, String fileTypes, OutputStream out, 
      boolean commit, boolean optimize, String[] args) {
    this.mode = mode;
    this.solrUrl = url;
    this.auto = auto;
    this.type = type;
    this.recursive = recursive;
    this.delay = delay;
    this.fileTypes = fileTypes;
    this.globFileFilter = getFileFilterFromFileTypes(fileTypes);
    this.out = out;
    this.commit = commit;
    this.optimize = optimize;
    this.args = args;
    pageFetcher = new PageFetcher();
  }

  public SimplePostTool() {}
  
  //
  // Do some action depending on which mode we have
  //
  private void doFilesMode() {
    currentDepth = 0;
    // Skip posting files if special param "-" given  
    if (!args[0].equals("-")) {
      info("Posting files to base url " + solrUrl + (!auto?" using content-type "+(type==null?DEFAULT_CONTENT_TYPE:type):"")+"..");
      if(auto)
        info("Entering auto mode. File endings considered are "+fileTypes);
      if(recursive > 0)
        info("Entering recursive mode, max depth="+recursive+", delay="+delay+"s"); 
      int numFilesPosted = postFiles(args, 0, out, type);
      info(numFilesPosted + " files indexed.");
    }
  }

  private void doArgsMode() {
    info("POSTing args to " + solrUrl + "..");
    for (String a : args) {
      postData(stringToStream(a), null, out, type, solrUrl);
    }
  }

  private int doWebMode() {
    reset();
    int numPagesPosted = 0;
    try {
      if(type != null) {
        fatal("Specifying content-type with \"-Ddata=web\" is not supported");
      }
      if (args[0].equals("-")) {
        // Skip posting url if special param "-" given  
        return 0;
      }
      // Set Extracting handler as default
      solrUrl = appendUrlPath(solrUrl, "/extract");
      
      info("Posting web pages to Solr url "+solrUrl);
      auto=true;
      info("Entering auto mode. Indexing pages with content-types corresponding to file endings "+fileTypes);
      if(recursive > 0) {
        if(recursive > MAX_WEB_DEPTH) {
          recursive = MAX_WEB_DEPTH;
          warn("Too large recursion depth for web mode, limiting to "+MAX_WEB_DEPTH+"...");
        }
        if(delay < DEFAULT_WEB_DELAY)
          warn("Never crawl an external web site faster than every 10 seconds, your IP will probably be blocked");
        info("Entering recursive mode, depth="+recursive+", delay="+delay+"s");
      }
      numPagesPosted = postWebPages(args, 0, out);
      info(numPagesPosted + " web pages indexed.");
    } catch(MalformedURLException e) {
      fatal("Wrong URL trying to append /extract to "+solrUrl);
    }
    return numPagesPosted;
  }

  private void doStdinMode() {
    info("POSTing stdin to " + solrUrl + "..");
    postData(System.in, null, out, type, solrUrl);    
  }

  private void reset() {
    fileTypes = DEFAULT_FILE_TYPES;
    globFileFilter = this.getFileFilterFromFileTypes(fileTypes);
    backlog = new ArrayList<LinkedHashSet<URL>>();
    visited = new HashSet<URL>();
  }


  //
  // USAGE
  //
  private static void usageShort() {
    System.out.println(USAGE_STRING_SHORT+"\n"+
        "       Please invoke with -h option for extended usage help.");
  }

  private static void usage() {
    System.out.println
    (USAGE_STRING_SHORT+"\n\n" +
     "Supported System Properties and their defaults:\n"+
     "  -Ddata=files|web|args|stdin (default=" + DEFAULT_DATA_MODE + ")\n"+
     "  -Dtype= (default=" + DEFAULT_CONTENT_TYPE + ")\n"+
     "  -Durl= (default=" + DEFAULT_POST_URL + ")\n"+
     "  -Dauto=yes|no (default=" + DEFAULT_AUTO + ")\n"+
     "  -Drecursive=yes|no| (default=" + DEFAULT_RECURSIVE + ")\n"+
     "  -Ddelay= (default=0 for files, 10 for web)\n"+
     "  -Dfiletypes=[,,...] (default=" + DEFAULT_FILE_TYPES + ")\n"+
     "  -Dparams=\"=[&=...]\" (values must be URL-encoded)\n"+
     "  -Dcommit=yes|no (default=" + DEFAULT_COMMIT + ")\n"+
     "  -Doptimize=yes|no (default=" + DEFAULT_OPTIMIZE + ")\n"+
     "  -Dout=yes|no (default=" + DEFAULT_OUT + ")\n\n"+
     "This is a simple command line tool for POSTing raw data to a Solr\n"+
     "port.  Data can be read from files specified as commandline args,\n"+
     "URLs specified as args, as raw commandline arg strings or via STDIN.\n"+
     "Examples:\n"+
     "  java -jar post.jar *.xml\n"+
     "  java -Ddata=args  -jar post.jar '42'\n"+
     "  java -Ddata=stdin -jar post.jar < hd.xml\n"+
     "  java -Ddata=web -jar post.jar http://example.com/\n"+
     "  java -Dtype=text/csv -jar post.jar *.csv\n"+
     "  java -Dtype=application/json -jar post.jar *.json\n"+
     "  java -Durl=http://localhost:8983/solr/update/extract -Dparams=literal.id=a -Dtype=application/pdf -jar post.jar a.pdf\n"+
     "  java -Dauto -jar post.jar *\n"+
     "  java -Dauto -Drecursive -jar post.jar afolder\n"+
     "  java -Dauto -Dfiletypes=ppt,html -jar post.jar afolder\n"+
     "The options controlled by System Properties include the Solr\n"+
     "URL to POST to, the Content-Type of the data, whether a commit\n"+
     "or optimize should be executed, and whether the response should\n"+
     "be written to STDOUT. If auto=yes the tool will try to set type\n"+
     "and url automatically from file name. When posting rich documents\n"+
     "the file name will be propagated as \"resource.name\" and also used\n"+
     "as \"literal.id\". You may override these or any other request parameter\n"+
     "through the -Dparams property. To do a commit only, use \"-\" as argument.\n"+
     "The web mode is a simple crawler following links within domain, default delay=10s.");
  }

  /** Post all filenames provided in args
   * @param args array of file names
   * @param startIndexInArgs offset to start
   * @param out output stream to write the Solr response to (may be null)
   * @param type default content-type to use when posting (may be overridden in auto mode)
   * @return number of files posted
   * */
  public int postFiles(String [] args,int startIndexInArgs, OutputStream out, String type) {
    reset();
    int filesPosted = 0;
    for (int j = startIndexInArgs; j < args.length; j++) {
      File srcFile = new File(args[j]);
      if(srcFile.isDirectory() && srcFile.canRead()) {
        filesPosted += postDirectory(srcFile, out, type);
      } else if (srcFile.isFile() && srcFile.canRead()) {
        filesPosted += postFiles(new File[] {srcFile}, out, type);
      } else {
        File parent = srcFile.getParentFile();
        if(parent == null) parent = new File(".");
        String fileGlob = srcFile.getName();
        GlobFileFilter ff = new GlobFileFilter(fileGlob, false);
        File[] files = parent.listFiles(ff);
        if(files == null || files.length == 0) {
          warn("No files or directories matching "+srcFile);
          continue;          
        }
        filesPosted += postFiles(parent.listFiles(ff), out, type);
      }
    }
    return filesPosted;
  }
  
  /** Post all filenames provided in args
   * @param files array of Files
   * @param startIndexInArgs offset to start
   * @param out output stream to write the Solr response to (may be null)
   * @param type default content-type to use when posting (may be overridden in auto mode)
   * @return number of files posted
   * */
  public int postFiles(File[] files, int startIndexInArgs, OutputStream out, String type) {
    reset();
    int filesPosted = 0;
    for (File srcFile : files) {
      if(srcFile.isDirectory() && srcFile.canRead()) {
        filesPosted += postDirectory(srcFile, out, type);
      } else if (srcFile.isFile() && srcFile.canRead()) {
        filesPosted += postFiles(new File[] {srcFile}, out, type);
      } else {
        File parent = srcFile.getParentFile();
        if(parent == null) parent = new File(".");
        String fileGlob = srcFile.getName();
        GlobFileFilter ff = new GlobFileFilter(fileGlob, false);
        File[] fileList = parent.listFiles(ff);
        if(fileList == null || fileList.length == 0) {
          warn("No files or directories matching "+srcFile);
          continue;          
        }
        filesPosted += postFiles(fileList, out, type);
      }
    }
    return filesPosted;
  }
  
  /**
   * Posts a whole directory
   * @return number of files posted total
   */
  private int postDirectory(File dir, OutputStream out, String type) {
    if(dir.isHidden() && !dir.getName().equals("."))
      return(0);
    info("Indexing directory "+dir.getPath()+" ("+dir.listFiles(globFileFilter).length+" files, depth="+currentDepth+")");
    int posted = 0;
    posted += postFiles(dir.listFiles(globFileFilter), out, type);
    if(recursive > currentDepth) {
      for(File d : dir.listFiles()) {
        if(d.isDirectory()) {
          currentDepth++;
          posted += postDirectory(d, out, type);
          currentDepth--;
        }
      }
    }
    return posted;
  }

  /**
   * Posts a list of file names
   * @return number of files posted
   */
  int postFiles(File[] files, OutputStream out, String type) {
    int filesPosted = 0;
    for(File srcFile : files) {
      try {
        if(!srcFile.isFile() || srcFile.isHidden())
          continue;
        postFile(srcFile, out, type);
        Thread.sleep(delay * 1000);
        filesPosted++;
      } catch (InterruptedException e) {
        throw new RuntimeException(e);
      }
    }
    return filesPosted;
  }

  /**
   * This method takes as input a list of start URL strings for crawling,
   * adds each one to a backlog and then starts crawling
   * @param args the raw input args from main()
   * @param startIndexInArgs offset for where to start
   * @param out outputStream to write results to
   * @return the number of web pages posted
   */
  public int postWebPages(String[] args, int startIndexInArgs, OutputStream out) {
    reset();
    LinkedHashSet<URL> s = new LinkedHashSet<URL>();
    for (int j = startIndexInArgs; j < args.length; j++) {
      try {
        URL u = new URL(normalizeUrlEnding(args[j]));
        s.add(u);
      } catch(MalformedURLException e) {
        warn("Skipping malformed input URL: "+args[j]);
      }
    }
    // Add URLs to level 0 of the backlog and start recursive crawling
    backlog.add(s);
    return webCrawl(0, out);
  }

  /**
   * Normalizes a URL string by removing anchor part and trailing slash
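   * (e.g. "http://example.com/page/#top" becomes "http://example.com/page")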
   * @return the normalized URL string
   */
  protected static String normalizeUrlEnding(String link) {
    if(link.indexOf("#") > -1)
      link = link.substring(0,link.indexOf("#"));
    if(link.endsWith("?"))
      link = link.substring(0,link.length()-1);
    if(link.endsWith("/"))
      link = link.substring(0,link.length()-1);
    return link;
  }

  /**
   * A very simple crawler, pulling URLs to fetch from a backlog and then
   * recursing N levels deep if recursive>0. Links are parsed from HTML
   * by first getting an XHTML version using SolrCell with extractOnly,
   * and are followed if they are local. The crawler pauses for a default delay
   * of 10 seconds between each fetch; this can be configured in the delay
   * variable. This is only meant for test purposes, as it does not respect
   * robots or anything else fancy :)
   * @param level which level to crawl
   * @param out output stream to write to
   * @return number of pages crawled on this level and below
   */
  protected int webCrawl(int level, OutputStream out) {
    int numPages = 0;
    LinkedHashSet<URL> stack = backlog.get(level);
    int rawStackSize = stack.size();
    stack.removeAll(visited);
    int stackSize = stack.size();
    LinkedHashSet<URL> subStack = new LinkedHashSet<URL>();
    info("Entering crawl at level "+level+" ("+rawStackSize+" links total, "+stackSize+" new)");
    for(URL u : stack) {
      try {
        visited.add(u);
        PageFetcherResult result = pageFetcher.readPageFromUrl(u);
        if(result.httpStatus == 200) {
          u = (result.redirectUrl != null) ? result.redirectUrl : u;
          URL postUrl = new URL(appendParam(solrUrl.toString(), 
              "literal.id="+URLEncoder.encode(u.toString(),"UTF-8") +
              "&literal.url="+URLEncoder.encode(u.toString(),"UTF-8")));
          boolean success = postData(new ByteArrayInputStream(result.content), null, out, result.contentType, postUrl);
          if (success) {
            info("POSTed web resource "+u+" (depth: "+level+")");
            Thread.sleep(delay * 1000);
            numPages++;
            // Pull links from HTML pages only
            if(recursive > level && result.contentType.equals("text/html")) {
              Set<URL> children = pageFetcher.getLinksFromWebPage(u, new ByteArrayInputStream(result.content), result.contentType, postUrl);
              subStack.addAll(children);
            }
          } else {
            warn("An error occurred while posting "+u);
          }
        } else {
          warn("The URL "+u+" returned a HTTP result status of "+result.httpStatus);
        }
      } catch (IOException e) {
        warn("Caught exception when trying to open connection to "+u+": "+e.getMessage());
      } catch (InterruptedException e) {
        throw new RuntimeException(e);
      }
    }
    if(!subStack.isEmpty()) {
      backlog.add(subStack);
      numPages += webCrawl(level+1, out);
    }
    return numPages;    
  }

  /**
   * Reads an input stream into a byte array
   * @param is the input stream
   * @return the byte array
   * @throws IOException If there is a low-level I/O error.
   */
  protected byte[] inputStreamToByteArray(InputStream is) throws IOException {
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    int next = is.read();
    while (next > -1) {
        bos.write(next);
        next = is.read();
    }
    bos.flush();
    is.close();
    return bos.toByteArray();
  }

  /**
   * Computes the full URL based on a base url and a possibly relative link found
   * in the href param of an HTML anchor.
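   * For example, base "http://example.com/docs/index.html" with link "page.html"
   * resolves to "http://example.com/docs/page.html".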
   * @param baseUrl the base url from where the link was found
   * @param link the absolute or relative link
   * @return the string version of the full URL
   */
  protected String computeFullUrl(URL baseUrl, String link) {
    if(link == null || link.length() == 0) {
      return null;
    }
    if(!link.startsWith("http")) {
      if(link.startsWith("/")) {
        link = baseUrl.getProtocol() + "://" + baseUrl.getAuthority() + link;
      } else {
        if(link.contains(":")) {
          return null; // Skip non-relative URLs
        }
        String path = baseUrl.getPath();
        if(!path.endsWith("/")) {
          int sep = path.lastIndexOf("/");
          String file = path.substring(sep+1);
          if(file.contains(".") || file.contains("?"))
            path = path.substring(0,sep);
        }
        link = baseUrl.getProtocol() + "://" + baseUrl.getAuthority() + path + "/" + link;
      }
    }
    link = normalizeUrlEnding(link);
    String l = link.toLowerCase(Locale.ROOT);
    // Simple brute force skip images
    if(l.endsWith(".jpg") || l.endsWith(".jpeg") || l.endsWith(".png") || l.endsWith(".gif")) {
      return null; // Skip images
    }
    return link;
  }

  /**
   * Uses the mime-type map to reverse lookup whether the file ending for our type
   * is supported by the fileTypes option
   * @param type what content-type to lookup
   * @return true if this is a supported content type
   */
  protected boolean typeSupported(String type) {
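    // e.g. "application/pdf" is supported when "pdf" appears in the fileTypes list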
    for(String key : mimeMap.keySet()) {
      if(mimeMap.get(key).equals(type)) {
        if(fileTypes.contains(key))
          return true;
      }
    }
    return false;
  }

  /**
   * Tests if a string is either "true", "on", "yes" or "1"
   * @param property the string to test
   * @return true if "on"
   */
  protected static boolean isOn(String property) {
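    // NOTE: this is a substring test, so any substring of "true,on,yes,1" matches,
    // including the empty string produced by a bare -Dauto flag (hence "java -Dauto ..." works)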
    return("true,on,yes,1".indexOf(property) > -1);
  }
  
  static void warn(String msg) {
    System.err.println("SimplePostTool: WARNING: " + msg);
  }

  static void info(String msg) {
    System.out.println(msg);
  }

  static void fatal(String msg) {
    System.err.println("SimplePostTool: FATAL: " + msg);
    System.exit(2);
  }

  /**
   * Does a simple commit operation 
   */
  public void commit() {
    info("COMMITting Solr index changes to " + solrUrl + "..");
    doGet(appendParam(solrUrl.toString(), "commit=true"));
  }

  /**
   * Does a simple optimize operation 
   */
  public void optimize() {
    info("Performing an OPTIMIZE to " + solrUrl + "..");
    doGet(appendParam(solrUrl.toString(), "optimize=true"));
  }

  /**
   * Appends a URL query parameter to a URL 
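   * (e.g. appendParam("http://localhost:8983/solr/update", "commit=true")
   * yields "http://localhost:8983/solr/update?commit=true")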
   * @param url the original URL
   * @param param the parameter(s) to append, separated by "&"
   * @return the string version of the resulting URL
   */
  public static String appendParam(String url, String param) {
    String[] pa = param.split("&");
    for(String p : pa) {
      if(p.trim().length() == 0) continue;
      String[] kv = p.split("=");
      if(kv.length == 2) {
        url = url + (url.indexOf('?')>0 ? "&" : "?") + kv[0] +"="+ kv[1];
      } else {
        warn("Skipping param "+p+" which is not on form key=value");
      }
    }
    return url;
  }

  /**
   * Opens the file and posts its contents to the solrUrl,
   * writes the response to output.
   */
  public void postFile(File file, OutputStream output, String type) {
    InputStream is = null;
    try {
      URL url = solrUrl;
      if(auto) {
        if(type == null) {
          type = guessType(file);
        }
        if(type != null) {
          if(type.equals("text/xml") || type.equals("text/csv") || type.equals("application/json")) {
            // Default handler
          } else {
            // SolrCell
            String urlStr = appendUrlPath(solrUrl, "/extract").toString();
            if(urlStr.indexOf("resource.name")==-1)
              urlStr = appendParam(urlStr, "resource.name=" + URLEncoder.encode(file.getAbsolutePath(), "UTF-8"));
            if(urlStr.indexOf("literal.id")==-1)
              urlStr = appendParam(urlStr, "literal.id=" + URLEncoder.encode(file.getAbsolutePath(), "UTF-8"));
            url = new URL(urlStr);
          }
        } else {
          warn("Skipping "+file.getName()+". Unsupported file type for auto mode.");
          return;
        }
      } else {
        if(type == null) type = DEFAULT_CONTENT_TYPE;
      }
      info("POSTing file " + file.getName() + (auto?" ("+type+")":""));
      is = new FileInputStream(file);
      postData(is, (int)file.length(), output, type, url);
    } catch (IOException e) {
      e.printStackTrace();
      warn("Can't open/read file: " + file);
    } finally {
      try {
        if(is!=null) is.close();
      } catch (IOException e) {
        fatal("IOException while closing file: "+ e);
      }
    }
  }

  /**
   * Appends to the path of the URL
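   * (e.g. appending "/extract" to "http://localhost:8983/solr/update?commit=true"
   * gives "http://localhost:8983/solr/update/extract?commit=true")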
   * @param url the URL
   * @param append the path to append
   * @return the final URL version 
   */
  protected static URL appendUrlPath(URL url, String append) throws MalformedURLException {
    return new URL(url.getProtocol() + "://" + url.getAuthority() + url.getPath() + append + (url.getQuery() != null ? "?"+url.getQuery() : ""));
  }

  /**
   * Guesses the type of a file, based on file name suffix
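   * (e.g. "report.PDF" maps to "application/pdf"; unknown suffixes yield null)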
   * @param file the file
   * @return the content-type guessed
   */
  protected static String guessType(File file) {
    String name = file.getName();
    String suffix = name.substring(name.lastIndexOf(".")+1);
    return mimeMap.get(suffix.toLowerCase(Locale.ROOT));
  }

  /**
   * Performs a simple get on the given URL
   */
  public static void doGet(String url) {
    try {
      doGet(new URL(url));
    } catch (MalformedURLException e) {
      warn("The specified URL "+url+" is not a valid URL. Please check");
    }
  }
  
  /**
   * Performs a simple get on the given URL
   */
  public static void doGet(URL url) {
    try {
      if(mockMode) return;
      HttpURLConnection urlc = (HttpURLConnection) url.openConnection();
      if (HttpURLConnection.HTTP_OK != urlc.getResponseCode()) {
        warn("Solr returned an error #" + urlc.getResponseCode() + 
            " " + urlc.getResponseMessage() + " for url "+url);
      }
    } catch (IOException e) {
      warn("An error occurred posting data to "+url+". Please check that Solr is running.");
    }
  }

  /**
   * Reads data from the data stream and posts it to Solr,
   * writes the response to output
   * @return true if success
   */
  public boolean postData(InputStream data, Integer length, OutputStream output, String type, URL url) {
    if(mockMode) return true;
    boolean success = true;
    if(type == null)
      type = DEFAULT_CONTENT_TYPE;
    HttpURLConnection urlc = null;
    try {
      try {
        urlc = (HttpURLConnection) url.openConnection();
        try {
          urlc.setRequestMethod("POST");
        } catch (ProtocolException e) {
          fatal("Shouldn't happen: HttpURLConnection doesn't support POST??"+e);
        }
        urlc.setDoOutput(true);
        urlc.setDoInput(true);
        urlc.setUseCaches(false);
        urlc.setAllowUserInteraction(false);
        urlc.setRequestProperty("Content-type", type);

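        // A known content length enables fixed-length streaming mode, so the
        // request body is streamed rather than buffered in memory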
        if (null != length) urlc.setFixedLengthStreamingMode(length);

      } catch (IOException e) {
        fatal("Connection error (is Solr running at " + solrUrl + " ?): " + e);
        success = false;
      }
      
      OutputStream out = null;
      try {
        out = urlc.getOutputStream();
        pipe(data, out);
      } catch (IOException e) {
        fatal("IOException while posting data: " + e);
        success = false;
      } finally {
        try { if(out!=null) out.close(); } catch (IOException x) { /*NOOP*/ }
      }
      
      InputStream in = null;
      try {
        if (HttpURLConnection.HTTP_OK != urlc.getResponseCode()) {
          warn("Solr returned an error #" + urlc.getResponseCode() + 
                " " + urlc.getResponseMessage());
          success = false;
        }

        in = urlc.getInputStream();
        pipe(in, output);
      } catch (IOException e) {
        warn("IOException while reading response: " + e);
        success = false;
      } finally {
        try { if(in!=null) in.close(); } catch (IOException x) { /*NOOP*/ }
      }
      
    } finally {
      if(urlc!=null) urlc.disconnect();
    }
    return success;
  }

  /**
   * Converts a string to an input stream 
   * @param s the string
   * @return the input stream
   */
  public static InputStream stringToStream(String s) {
    InputStream is = null;
    try {
      is = new ByteArrayInputStream(s.getBytes("UTF-8"));
    } catch (UnsupportedEncodingException e) {
      fatal("Shouldn't happen: UTF-8 not supported?!?!?!");
    }
    return is;
  }

  /**
   * Pipes everything from the source to the dest.  If dest is null, 
   * then everything is read from source and thrown away.
   */
  private static void pipe(InputStream source, OutputStream dest) throws IOException {
    byte[] buf = new byte[1024];
    int read = 0;
    while ( (read = source.read(buf) ) >= 0) {
      if (null != dest) dest.write(buf, 0, read);
    }
    if (null != dest) dest.flush();
  }

  public GlobFileFilter getFileFilterFromFileTypes(String fileTypes) {
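    // e.g. fileTypes "xml,json" becomes the case-insensitive regex ^.*\.(xml|json)$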
    String glob;
    if(fileTypes.equals("*"))
      glob = ".*";
    else
      glob = "^.*\\.(" + fileTypes.replace(",", "|") + ")$";
    return new GlobFileFilter(glob, true);
  }

  //
  // Utility methods for XPath handling
  //
  
  /**
   * Gets all nodes matching an XPath
   */
  public static NodeList getNodesFromXP(Node n, String xpath) throws XPathExpressionException {
    XPathFactory factory = XPathFactory.newInstance();
    XPath xp = factory.newXPath();
    XPathExpression expr = xp.compile(xpath);
    return (NodeList) expr.evaluate(n, XPathConstants.NODESET);
  }
  
  /**
   * Gets the string content of the node matching an XPath
   * @param n the node (or doc)
   * @param xpath the xpath string
   * @param concatAll if true, text from all matching nodes will be concatenated, else only the first returned
   */
  public static String getXP(Node n, String xpath, boolean concatAll)
      throws XPathExpressionException {
    NodeList nodes = getNodesFromXP(n, xpath);
    StringBuilder sb = new StringBuilder();
    if (nodes.getLength() > 0) {
      for(int i = 0; i < nodes.getLength() ; i++) {
        sb.append(nodes.item(i).getNodeValue() + " ");
        if(!concatAll) break;
      }
      return sb.toString().trim();
    } else
      return "";
  }
  
  /**
   * Takes a string as input and returns a DOM 
   */
  public static Document makeDom(String in, String inputEncoding) throws SAXException, IOException,
  ParserConfigurationException {
    InputStream is = new ByteArrayInputStream(in
        .getBytes(inputEncoding));
    Document dom = DocumentBuilderFactory.newInstance()
        .newDocumentBuilder().parse(is);
    return dom;
  }

  /**
   * Inner class to filter files based on glob wildcards
   */
  class GlobFileFilter implements FileFilter
  {
    private String _pattern;
    private Pattern p;
    
    public GlobFileFilter(String pattern, boolean isRegex)
    {
      _pattern = pattern;
      if(!isRegex) {
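        // Escape regex metacharacters, then translate glob wildcards ("*" -> ".*", "?" -> ".");
        // e.g. the glob "*.xml" becomes the anchored regex ^.*\.xml$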
        _pattern = _pattern
            .replace("^", "\\^")
            .replace("$", "\\$")
            .replace(".", "\\.")
            .replace("(", "\\(")
            .replace(")", "\\)")
            .replace("+", "\\+")
            .replace("*", ".*")
            .replace("?", ".");
        _pattern = "^" + _pattern + "$";
      }
      
      try {
        p = Pattern.compile(_pattern,Pattern.CASE_INSENSITIVE);
      } catch(PatternSyntaxException e) {
        fatal("Invalid type list "+pattern+". "+e.getDescription());
      }
    }
    
    @Override
    public boolean accept(File file)
    {
      return p.matcher(file.getName()).find();
    }
  }
  
  //
  // Simple crawler class which can fetch a page and check for robots.txt
  //
  class PageFetcher {
    Map<String, List<String>> robotsCache;
    final String DISALLOW = "Disallow:";
    
    public PageFetcher() {
      robotsCache = new HashMap<String, List<String>>();
    }
    
    public PageFetcherResult readPageFromUrl(URL u) {
      PageFetcherResult res = new PageFetcherResult();
      try {
        if (isDisallowedByRobots(u)) {
          warn("The URL "+u+" is disallowed by robots.txt and will not be crawled.");
          res.httpStatus = 403;
          visited.add(u);
          return res;
        }
        res.httpStatus = 404;
        HttpURLConnection conn = (HttpURLConnection) u.openConnection();
        conn.setRequestProperty("User-Agent", "SimplePostTool-crawler/"+VERSION_OF_THIS_TOOL+" (http://lucene.apache.org/solr/)");
        conn.setRequestProperty("Accept-Encoding", "gzip, deflate");
        conn.connect();
        res.httpStatus = conn.getResponseCode();
        if(!normalizeUrlEnding(conn.getURL().toString()).equals(normalizeUrlEnding(u.toString()))) {
          info("The URL "+u+" caused a redirect to "+conn.getURL());
          u = conn.getURL();
          res.redirectUrl = u;
          visited.add(u);
        }
        if(res.httpStatus == 200) {
          // Raw content type of form "text/html; encoding=utf-8"
          String rawContentType = conn.getContentType();
          String type = rawContentType.split(";")[0];
          if(typeSupported(type)) {
            String encoding = conn.getContentEncoding();
            InputStream is;
            if (encoding != null && encoding.equalsIgnoreCase("gzip")) {
              is = new GZIPInputStream(conn.getInputStream());
            } else if (encoding != null && encoding.equalsIgnoreCase("deflate")) {
              is = new InflaterInputStream(conn.getInputStream(), new Inflater(true));
            } else {
              is = conn.getInputStream();
            }
            
            // Read into memory, so that we later can pull links from the page without re-fetching 
            res.content = inputStreamToByteArray(is);
            is.close();
          } else {
            warn("Skipping URL with unsupported type "+type);
            res.httpStatus = 415;
          }
        }
      } catch(IOException e) {
        warn("IOException when reading page from url "+u+": "+e.getMessage());
      }
      return res;
    }
    
    public boolean isDisallowedByRobots(URL url) {
      String host = url.getHost();
      String strRobot = url.getProtocol() + "://" + host + "/robots.txt";
      List<String> disallows = robotsCache.get(host);
      if(disallows == null) {
        disallows = new ArrayList<String>();
        URL urlRobot;
        try { 
          urlRobot = new URL(strRobot);
          disallows = parseRobotsTxt(urlRobot.openStream());
        } catch (MalformedURLException e) {
          return true; // We cannot trust this robots URL, should not happen
        } catch (IOException e) {
          // There is no robots.txt, will cache an empty disallow list
        }
      }
      
      robotsCache.put(host, disallows);

      String strURL = url.getFile();
      for (String path : disallows) {
        if (path.equals("/") || strURL.indexOf(path) == 0)
          return true;
      }
      return false;
    }

    /**
     * Very simple robots.txt parser which obeys all Disallow lines regardless
     * of user agent or whether there are valid Allow: lines.
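     * For example, the line "Disallow: /private" adds "/private" to the returned list.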
     * @param is Input stream of the robots.txt file
     * @return a list of disallow paths
     * @throws IOException if problems reading the stream
     */
    protected List<String> parseRobotsTxt(InputStream is) throws IOException {
      List<String> disallows = new ArrayList<String>();
      BufferedReader r = new BufferedReader(new InputStreamReader(is, "UTF-8"));
      String l;
      while((l = r.readLine()) != null) {
        String[] arr = l.split("#");
        if(arr.length == 0) continue;
        l = arr[0].trim();
        if(l.startsWith(DISALLOW)) {
          l = l.substring(DISALLOW.length()).trim();
          if(l.length() == 0) continue;
          disallows.add(l);
        }
      }
      is.close();
      return disallows;
    }

    /**
     * Finds links on a web page, using /extract?extractOnly=true
     * @param u the URL of the web page
     * @param is the input stream of the page
     * @param type the content-type
     * @param postUrl the URL (typically /solr/extract) in order to pull out links
     * @return a set of URLs parsed from the page
     */
    protected Set<URL> getLinksFromWebPage(URL u, InputStream is, String type, URL postUrl) {
      Set<URL> l = new HashSet<URL>();
      URL url = null;
      try {
        ByteArrayOutputStream os = new ByteArrayOutputStream();
        URL extractUrl = new URL(appendParam(postUrl.toString(), "extractOnly=true"));
        boolean success = postData(is, null, os, type, extractUrl);
        if(success) {
          String rawXml = os.toString("UTF-8");
          Document d = makeDom(rawXml, "UTF-8");
          String innerXml = getXP(d, "/response/str/text()[1]", false);
          d = makeDom(innerXml, "UTF-8");
          NodeList links = getNodesFromXP(d, "/html/body//a/@href");
          for(int i = 0; i < links.getLength(); i++) {
            String link = links.item(i).getTextContent();
            link = computeFullUrl(u, link);
            if(link == null)
              continue;
            url = new URL(link);
            if(url.getAuthority() == null || !url.getAuthority().equals(u.getAuthority()))
              continue;
            l.add(url);
          }
        }
      } catch (MalformedURLException e) {
        warn("Malformed URL "+url);
      } catch (IOException e) {
        warn("IOException opening URL "+url+": "+e.getMessage());
      } catch (Exception e) {
        throw new RuntimeException(e);
      }
      return l;
    }
  }
    
  /**
   * Utility class to hold the result from a page fetch
   */
  public class PageFetcherResult {
    int httpStatus = 200;
    String contentType = "text/html";
    URL redirectUrl = null;
    byte[] content;
  }
}