thredds.util.DodsURLExtractor Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of netcdf Show documentation
The NetCDF-Java Library is a Java interface to NetCDF files, as well as to many other types of scientific data formats.
The newest version!
/*
 * Copyright 1998-2009 University Corporation for Atmospheric Research/Unidata
 *
 * Portions of this software were developed by the Unidata Program at the
 * University Corporation for Atmospheric Research.
 *
 * Access and use of this software shall impose the following obligations
 * and understandings on the user. The user is granted the right, without
 * any fee or cost, to use, copy, modify, alter, enhance and distribute
 * this software, and any derivative works thereof, and its supporting
 * documentation for any purpose whatsoever, provided that this entire
 * notice appears in all copies of the software, derivative works and
 * supporting documentation.  Further, UCAR requests that the user credit
 * UCAR/Unidata in any publications that result from the use of this
 * software or in any product that includes this software. The names UCAR
 * and/or Unidata, however, may not be used in any advertising or publicity
 * to endorse or promote any products or commercial entity unless specific
 * written permission is obtained from UCAR/Unidata. The user also
 * understands that UCAR/Unidata is not obligated to provide the user with
 * any support, consulting, training or assistance of any kind with regard
 * to the use, operation and performance of this software nor to provide
 * the user with any updates, revisions, new versions or "bug fixes."
 *
 * THIS SOFTWARE IS PROVIDED BY UCAR/UNIDATA "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL UCAR/UNIDATA BE LIABLE FOR ANY SPECIAL,
 * INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING
 * FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
 * NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
 * WITH THE ACCESS, USE OR PERFORMANCE OF THIS SOFTWARE.
 */
// Sample code from John that parses HTML and looks for URLs


package thredds.util;

import javax.swing.text.AttributeSet;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Enumeration;

public class DodsURLExtractor {
  // HTML parser obtained once in the constructor and reused by extract() and getTextContent().
  private HTMLEditorKit.Parser parser;
  // Resolved URLs (stored as Strings) taken from HREF attributes; re-created on every extract() call.
  private ArrayList urlList;
  // URL of the page currently being parsed; HREF values are resolved against it.
  private URL baseURL;

  // Set to true by extract() and to false by getTextContent().
  private boolean wantURLS = false;
  // Text passed to the most recent handleText() call that happened while isTitle was true.
  private String title;
  // While true, handleText() copies the incoming text into title.
  private boolean isTitle;

  // Collects page text, each run followed by one space; re-created on every getTextContent() call.
  private StringBuffer textBuffer;
  // Set to true by getTextContent() and to false by extract(); gates the appending done in handleText().
  private boolean wantText = false;
  // When true, diagnostic messages are printed to System.out.
  private boolean debug = false;

  /**
   * Creates an extractor and fetches the HTML parser that every later
   * parse call on this instance reuses.
   */
  public DodsURLExtractor() {
    parser = new ParserGetter().getParser();
  }

  /**
   * Extract the URLs found in HREF attributes of the page at the given URL and return them in a List.
   * <p>
   * The page is opened, passed through filterTag(), and parsed with a CallerBacker callback
   * that fills {@code urlList}. The stream opened on the URL is always closed before this
   * method returns, including when filtering or parsing fails.
   *
   * @param url address of the HTML page to scan
   * @return the list held in {@code urlList} after parsing; the entries are URL strings
   * @throws IOException if the URL is malformed, or if opening, filtering or parsing the page fails
   */
  public ArrayList extract(String url) throws IOException {
    if (debug) System.out.println(" URLextract=" + url);

    baseURL = new URL(url);
    InputStream in = baseURL.openStream();
    try {
      // Platform default charset, the same choice filterTag() makes for its reader on the raw stream.
      InputStreamReader r = new InputStreamReader(filterTag(in));
      HTMLEditorKit.ParserCallback callback = new CallerBacker();

      // Fresh result list, and switch the callback into URL-collecting mode.
      urlList = new ArrayList();
      wantURLS = true;
      wantText = false;
      parser.parse(r, callback, false);
    } finally {
      // Release the connection even when filterTag() or the parser throws.
      in.close();
    }

    return urlList;
  }

  /**
   * Extract text content from the page at the given URL and return it as a String.
   * <p>
   * The page is opened, passed through filterTag(), and parsed with a CallerBacker callback
   * that appends text to {@code textBuffer}. The stream opened on the URL is always closed
   * before this method returns, including when filtering or parsing fails.
   *
   * @param url address of the HTML page to read
   * @return the accumulated contents of {@code textBuffer} after parsing
   * @throws IOException if the URL is malformed, or if opening, filtering or parsing the page fails
   */
  public String getTextContent(String url) throws IOException {
    if (debug) System.out.println(" URL.getTextContent=" + url);

    baseURL = new URL(url);
    InputStream in = baseURL.openStream();
    try {
      // Platform default charset, the same choice filterTag() makes for its reader on the raw stream.
      InputStreamReader r = new InputStreamReader(filterTag(in));
      HTMLEditorKit.ParserCallback callback = new CallerBacker();

      // Fresh text buffer, and switch the callback into text-collecting mode.
      textBuffer = new StringBuffer(3000);
      wantURLS = false;
      wantText = true;
      parser.parse(r, callback, false);
    } finally {
      // Release the connection even when filterTag() or the parser throws.
      in.close();
    }

    return textBuffer.toString();
  }


  // Workaround: HTMLEditorKit.Parser can't deal with "content-encoding"
  private InputStream filterTag(InputStream in) throws IOException {
    BufferedReader buffIn = new BufferedReader(new InputStreamReader(in));
    ByteArrayOutputStream bos = new ByteArrayOutputStream(10000);

    String line = buffIn.readLine();
    while (line != null) {
      String lline = line.toLowerCase();
      if (0 <= lline.indexOf(""+" value= <"+value+ ">");
        try {
          if (name == HTML.Attribute.HREF) {
            URL u = baseURL.toURI().resolve(value).toURL();
            String urlName = u.toString();
            if (urlList != null)
              urlList.add(u.toString());
            if (debug) System.out.println(" extracted URL= <" + urlName + ">");
          }
        } catch (MalformedURLException ex) {
          System.err.println(ex);
          System.err.println(baseURL);
          System.err.println(value);
          ex.printStackTrace();
        } catch (URISyntaxException ex) {
          System.err.println(ex);
          System.err.println(baseURL);
          System.err.println(value);
          ex.printStackTrace();
        }
      } // while
    } // extractHREF

    /**
     * Receives one run of document text. While isTitle is set the run is remembered
     * as the title; while wantText is set the run, followed by a single space, is
     * appended to textBuffer.
     *
     * @param text     characters of the text run
     * @param position not used by this method
     */
    public void handleText(char[] text, int position) {
      if (isTitle) {
        title = String.valueOf(text);
      }

      if (!wantText) {
        return;
      }
      textBuffer.append(text).append(' ');
    }

  } // Callerbacker

  /**
   * HTMLEditorKit subclass whose only job is to widen the visibility of
   * getParser() so the enclosing class can call it.
   */
  private class ParserGetter extends HTMLEditorKit {
    /**
     * @return the parser supplied by the superclass implementation
     */
    @Override
    public HTMLEditorKit.Parser getParser() {
      HTMLEditorKit.Parser inherited = super.getParser();
      return inherited;
    }
  }
}