de.l3s.boilerpipe.sax.HTMLFetcher Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of webcontent-grabber Show documentation
Show all versions of webcontent-grabber Show documentation
A java client library to grab the webcontent
The newest version!
package de.l3s.boilerpipe.sax;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
/**
* A very simple HTTP/HTML fetcher, really just for demo purposes.
*
* @author Christian Kohlschütter
*/
public class HTMLFetcher {
private HTMLFetcher() {
}
private static final Pattern PAT_CHARSET = Pattern.compile("charset=([^; ]+)$");
/**
* Fetches the document at the given URL, using {@link URLConnection}.
* @param url
* @return
* @throws IOException
*/
public static HTMLDocument fetch(final URL url) throws IOException {
final URLConnection conn = url.openConnection();
final String ct = conn.getContentType();
Charset cs = Charset.forName("Cp1252");
if (ct != null) {
Matcher m = PAT_CHARSET.matcher(ct);
if(m.find()) {
final String charset = m.group(1);
try {
cs = Charset.forName(charset);
} catch (UnsupportedCharsetException e) {
// keep default
}
}
}
InputStream in = conn.getInputStream();
final String encoding = conn.getContentEncoding();
if(encoding != null) {
if("gzip".equalsIgnoreCase(encoding)) {
in = new GZIPInputStream(in);
} else {
System.err.println("WARN: unsupported Content-Encoding: "+encoding);
}
}
ByteArrayOutputStream bos = new ByteArrayOutputStream();
byte[] buf = new byte[4096];
int r;
while ((r = in.read(buf)) != -1) {
bos.write(buf, 0, r);
}
in.close();
final byte[] data = bos.toByteArray();
return new HTMLDocument(data, cs);
}
}