// src.it.unimi.dsi.parser.callback.LinkExtractor (Maven / Gradle / Ivy)
package it.unimi.dsi.parser.callback;
/*
* DSI utilities
*
* Copyright (C) 2005-2017 Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/
import it.unimi.dsi.fastutil.objects.ObjectLinkedOpenHashSet;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.parser.Attribute;
import it.unimi.dsi.parser.BulletParser;
import it.unimi.dsi.parser.Element;
import it.unimi.dsi.util.TextPattern;
import java.util.Map;
import java.util.Set;
/**
* A callback extracting links.
*
* This callback extracts the links found in a web page. The
* links are then accessible in {@link #urls} (a set of {@link String}s). Note that
* we guarantee that the iteration order in the set is exactly
* the order in which links have been met (albeit copies appear
* just once).
*/
public class LinkExtractor extends DefaultCallback {
    /**
     * The pattern prefixing the URL in a {@code META} {@code HTTP-EQUIV}
     * element of refresh type.
     */
    private static final TextPattern URLEQUAL_PATTERN = new TextPattern("URL=", TextPattern.CASE_INSENSITIVE);

    /**
     * The URLs resulting from the parsing process. The backing set is a linked
     * hash set, so iteration follows the order in which URLs were first added.
     */
    public final Set<String> urls = new ObjectLinkedOpenHashSet<>();

    /**
     * The URL contained in the first {@code META} {@code HTTP-EQUIV} element
     * of refresh type (if any).
     */
    private String metaRefresh = null;

    /**
     * The URL contained in the first {@code META} {@code HTTP-EQUIV} element
     * of location type (if any).
     */
    private String metaLocation = null;

    /** The URL contained in the first {@code BASE} element (if any). */
    private String base = null;

    /**
     * Configures the parser to parse elements and certain attributes.
     *
     * <p>The required attributes are {@code SRC}, {@code HREF},
     * {@code HTTP-EQUIV}, and {@code CONTENT}.
     *
     * @param parser the parser to be configured.
     */
    @Override
    public void configure(final BulletParser parser) {
        parser.parseTags(true);
        parser.parseAttributes(true);
        parser.parseAttribute(Attribute.SRC);
        parser.parseAttribute(Attribute.HREF);
        parser.parseAttribute(Attribute.HTTP_EQUIV);
        parser.parseAttribute(Attribute.CONTENT);
    }

    /**
     * Resets this callback: empties {@link #urls} and forgets any base,
     * location and refresh URL recorded for a previous document.
     */
    @Override
    public void startDocument() {
        urls.clear();
        base = metaLocation = metaRefresh = null;
    }

    /**
     * Records the links carried by the element being opened.
     *
     * <ul>
     * <li>{@code A}, {@code AREA} and {@code LINK}: the {@code HREF} value is added to {@link #urls};
     * <li>{@code IFRAME}, {@code FRAME} and {@code EMBED}: the {@code SRC} value is added to {@link #urls};
     * <li>{@code BASE}: the first {@code HREF} value met is kept as base URL;
     * <li>{@code META}: the first refresh URL and the first location URL are kept.
     * </ul>
     *
     * <p>The attribute values in {@code attrMap} are only read, never modified.
     *
     * @param element the element being opened.
     * @param attrMap the attributes of the element.
     * @return always {@code true}.
     */
    @Override
    public boolean startElement(final Element element, final Map<Attribute, MutableString> attrMap) {
        // TODO: what about IMG?
        if (element == Element.A || element == Element.AREA || element == Element.LINK) {
            final MutableString href = attrMap.get(Attribute.HREF);
            if (href != null) {
                urls.add(href.toString());
            }
        }

        // IFRAME, FRAME or EMBED + SRC
        if (element == Element.IFRAME || element == Element.FRAME || element == Element.EMBED) {
            final MutableString src = attrMap.get(Attribute.SRC);
            if (src != null) {
                urls.add(src.toString());
            }
        }

        // BASE + HREF (change context!); only the first BASE counts
        if (element == Element.BASE && base == null) {
            final MutableString href = attrMap.get(Attribute.HREF);
            if (href != null) {
                base = href.toString();
            }
        }

        // META REFRESH/LOCATION
        if (element == Element.META) {
            final MutableString equiv = attrMap.get(Attribute.HTTP_EQUIV);
            final MutableString content = attrMap.get(Attribute.CONTENT);
            if (equiv != null && content != null) {
                // Compare on a copy: lower-casing the mutable string in place
                // would alter the attribute value held in the caller's map.
                final String httpEquiv = equiv.toString();

                // http-equiv="refresh" content="0;URL=http://foo.bar/..."
                if (metaRefresh == null && "refresh".equalsIgnoreCase(httpEquiv)) {
                    final int pos = URLEQUAL_PATTERN.search(content);
                    if (pos != -1) {
                        metaRefresh = content.substring(pos + URLEQUAL_PATTERN.length()).toString();
                    }
                }

                // http-equiv="location" content="http://foo.bar/..."
                if (metaLocation == null && "location".equalsIgnoreCase(httpEquiv)) {
                    metaLocation = content.toString();
                }
            }
        }
        return true;
    }

    /**
     * Returns the URL specified by {@code META} {@code HTTP-EQUIV} elements of
     * location type. More precisely, this method returns a non-{@code null}
     * result iff there is at least one {@code META HTTP-EQUIV} element
     * specifying a location URL (if there is more than one, we keep the first
     * one).
     *
     * @return the first URL specified by a {@code META} {@code HTTP-EQUIV}
     * element of location type, or {@code null}.
     */
    public String metaLocation() {
        return metaLocation;
    }

    /**
     * Returns the URL specified by the {@code BASE} element. More precisely,
     * this method returns a non-{@code null} result iff there is at least one
     * {@code BASE} element specifying a derelativisation URL (if there is more
     * than one, we keep the first one).
     *
     * @return the first URL specified by a {@code BASE} element, or
     * {@code null}.
     */
    public String base() {
        return base;
    }

    /**
     * Returns the URL specified by {@code META} {@code HTTP-EQUIV} elements of
     * refresh type. More precisely, this method returns a non-{@code null}
     * result iff there is at least one {@code META HTTP-EQUIV} element
     * specifying a refresh URL (if there is more than one, we keep the first
     * one).
     *
     * @return the first URL specified by a {@code META} {@code HTTP-EQUIV}
     * element of refresh type, or {@code null}.
     */
    public String metaRefresh() {
        return metaRefresh;
    }
}