thredds.util.DodsURLExtractor Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of netcdf Show documentation
Show all versions of netcdf Show documentation
The NetCDF-Java Library is a Java interface to NetCDF files,
as well as to many other types of scientific data formats.
/*
* Copyright 1998-2009 University Corporation for Atmospheric Research/Unidata
*
* Portions of this software were developed by the Unidata Program at the
* University Corporation for Atmospheric Research.
*
* Access and use of this software shall impose the following obligations
* and understandings on the user. The user is granted the right, without
* any fee or cost, to use, copy, modify, alter, enhance and distribute
* this software, and any derivative works thereof, and its supporting
* documentation for any purpose whatsoever, provided that this entire
* notice appears in all copies of the software, derivative works and
* supporting documentation. Further, UCAR requests that the user credit
* UCAR/Unidata in any publications that result from the use of this
* software or in any product that includes this software. The names UCAR
* and/or Unidata, however, may not be used in any advertising or publicity
* to endorse or promote any products or commercial entity unless specific
* written permission is obtained from UCAR/Unidata. The user also
* understands that UCAR/Unidata is not obligated to provide the user with
* any support, consulting, training or assistance of any kind with regard
* to the use, operation and performance of this software nor to provide
* the user with any updates, revisions, new versions or "bug fixes."
*
* THIS SOFTWARE IS PROVIDED BY UCAR/UNIDATA "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL UCAR/UNIDATA BE LIABLE FOR ANY SPECIAL,
* INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING
* FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
* NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
* WITH THE ACCESS, USE OR PERFORMANCE OF THIS SOFTWARE.
*/
// Sample code from John that parses HTML and looks for URLs
package thredds.util;
import javax.swing.text.AttributeSet;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Enumeration;
public class DodsURLExtractor {
// HTML parser obtained from ParserGetter in the constructor and used by every parse call.
private HTMLEditorKit.Parser parser;
// URLs collected while parsing, stored as Strings; extract() installs a fresh list on each call.
private ArrayList urlList;
// URL of the page currently being parsed; HREF values are resolved against it.
private URL baseURL;
// Set to true by extract() and to false by getTextContent().
private boolean wantURLS = false;
// Text most recently delivered to handleText() while isTitle was true.
private String title;
// When true, handleText() stores the incoming text in title.
// NOTE(review): the code that toggles this flag is not visible in this excerpt - confirm where it is set.
private boolean isTitle;
// Accumulates page text for getTextContent(); handleText() appends each chunk followed by one space.
private StringBuffer textBuffer;
// Set to true by getTextContent() and to false by extract(); gates the appending done in handleText().
private boolean wantText = false;
// Enables diagnostic messages on System.out.
private boolean debug = false;
/**
 * Creates an extractor and obtains the HTML parser that later parse calls
 * will use.
 */
public DodsURLExtractor() {
  // ParserGetter exists only to expose HTMLEditorKit's parser accessor.
  this.parser = new ParserGetter().getParser();
}
/**
 * Extracts the A-HREF URLs contained in the HTML document at the given URL.
 *
 * <p>The page is opened, passed through filterTag, and parsed; the parser
 * callback adds each URL it finds to the returned list. The stream opened
 * here is closed before this method returns, including when filtering or
 * parsing throws.
 *
 * @param url address of the HTML page to scan
 * @return the list the callback filled during parsing (URLs as Strings)
 * @throws IOException if the URL is malformed or the page cannot be opened or read
 */
public ArrayList extract(String url) throws IOException {
  if (debug) System.out.println(" URLextract="+url);

  baseURL = new URL(url);
  InputStream in = baseURL.openStream();
  try {
    InputStreamReader r = new InputStreamReader(filterTag(in));
    HTMLEditorKit.ParserCallback callback = new CallerBacker();

    // Select URL-collecting mode for the callback.
    urlList = new ArrayList();
    wantURLS = true;
    wantText = false;
    parser.parse(r, callback, false);
  } finally {
    // Release the connection even if filtering or parsing failed;
    // closing an already-closed stream has no effect.
    in.close();
  }
  return urlList;
}
/**
 * Extracts the text content of the HTML document at the given URL.
 *
 * <p>The page is opened, passed through filterTag, and parsed; the parser
 * callback appends each run of text, followed by a space, to an internal
 * buffer. The stream opened here is closed before this method returns,
 * including when filtering or parsing throws.
 *
 * @param url address of the HTML page to read
 * @return the accumulated text
 * @throws IOException if the URL is malformed or the page cannot be opened or read
 */
public String getTextContent(String url) throws IOException {
  if (debug) System.out.println(" URL.getTextContent="+url);

  baseURL = new URL(url);
  InputStream in = baseURL.openStream();
  try {
    InputStreamReader r = new InputStreamReader(filterTag(in));
    HTMLEditorKit.ParserCallback callback = new CallerBacker();

    // Select text-collecting mode for the callback.
    textBuffer = new StringBuffer(3000);
    wantURLS = false;
    wantText = true;
    parser.parse(r, callback, false);
  } finally {
    // Release the connection even if filtering or parsing failed;
    // closing an already-closed stream has no effect.
    in.close();
  }
  return textBuffer.toString();
}
// workaround for HTMLEditorKit.Parser, which can't deal with "content-encoding"
private InputStream filterTag(InputStream in) throws IOException {
BufferedReader buffIn = new BufferedReader(new InputStreamReader(in));
ByteArrayOutputStream bos = new ByteArrayOutputStream(10000);
String line = buffIn.readLine();
while ( line != null ) {
String lline = line.toLowerCase();
if ( 0 <= lline.indexOf( ""+" value= <"+value+ ">");
try {
if (name == HTML.Attribute.HREF) {
URL u = baseURL.toURI().resolve( value ).toURL();
String urlName = u.toString();
if (urlList != null)
urlList.add( u.toString());
if (debug) System.out.println(" extracted URL= <"+urlName+ ">");
}
} catch (MalformedURLException ex) {
System.err.println(ex);
System.err.println(baseURL);
System.err.println(value);
ex.printStackTrace();
}
catch( URISyntaxException ex)
{
System.err.println( ex );
System.err.println( baseURL );
System.err.println( value );
ex.printStackTrace();
}
} // while
} // extractHREF
/**
 * Receives a run of character data from the parser.
 *
 * While isTitle is set, the text is remembered as the title. When text
 * collection is enabled, the text is also appended to textBuffer, followed
 * by a single space.
 *
 * @param text the characters delivered by the parser
 * @param position not used by this implementation
 */
public void handleText(char[] text, int position) {
  if (isTitle) {
    title = String.valueOf(text);
  }
  if (!wantText) {
    return;
  }
  textBuffer.append(text).append(' ');
}
} // Callerbacker
private class ParserGetter extends HTMLEditorKit {
// purely to make this method public
public HTMLEditorKit.Parser getParser(){
return super.getParser();
}
}
}