All Downloads are FREE. Search and download functionalities are using the official Maven repository.

src.it.unimi.dsi.parser.callback.LinkExtractor Maven / Gradle / Ivy

Go to download

The DSI utilities are a mishmash of classes accumulated during the last twenty years in projects developed at the DSI (Dipartimento di Scienze dell'Informazione, i.e., Information Sciences Department), now DI (Dipartimento di Informatica, i.e., Informatics Department), of the Università degli Studi di Milano.

There is a newer version: 2.7.3
Show newest version
/*
 * DSI utilities
 *
 * Copyright (C) 2005-2020 Sebastiano Vigna
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

package it.unimi.dsi.parser.callback;

import java.util.Map;
import java.util.Set;

import it.unimi.dsi.fastutil.objects.ObjectLinkedOpenHashSet;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.parser.Attribute;
import it.unimi.dsi.parser.BulletParser;
import it.unimi.dsi.parser.Element;
import it.unimi.dsi.util.TextPattern;

/**
 * A callback extracting links.
 *
 * 

This callbacks extracts links existing in the web page. The * links are then accessible in {@link #urls} (a set of {@link String}s). Note that * we guarantee that the iteration order in the set is exactly * the order in which links have been met (albeit copies appear * just once). */ public class LinkExtractor extends DefaultCallback { /** The pattern prefixing the URL in a META HTTP-EQUIV element of refresh type. */ private static final TextPattern URLEQUAL_PATTERN = new TextPattern("URL=", TextPattern.CASE_INSENSITIVE); /** The URLs resulting from the parsing process. */ public final Set urls = new ObjectLinkedOpenHashSet<>(); /** The URL contained in the first META HTTP-EQUIV element of refresh type (if any). */ private String metaRefresh = null; /** The URL contained in the first META HTTP-EQUIV element of location type (if any). */ private String metaLocation = null; /** The URL contained in the first BASE element (if any). */ private String base = null; /** * Configure the parser to parse elements and certain attributes. * *

* The required attributes are SRC , HREF , HTTP-EQUIV , and CONTENT * . * */ @Override public void configure(final BulletParser parser) { parser.parseTags(true); parser.parseAttributes(true); parser.parseAttribute(Attribute.SRC); parser.parseAttribute(Attribute.HREF); parser.parseAttribute(Attribute.HTTP_EQUIV); parser.parseAttribute(Attribute.CONTENT); } @Override public void startDocument() { urls.clear(); base = metaLocation = metaRefresh = null; } @Override public boolean startElement(final Element element, final Map attrMap) { Object s; // TODO: what about IMG? if (element == Element.A || element == Element.AREA || element == Element.LINK) { s = attrMap.get(Attribute.HREF); if (s != null) urls.add(s.toString()); } // IFRAME or FRAME + SRC if (element == Element.IFRAME || element == Element.FRAME || element == Element.EMBED) { s = attrMap.get(Attribute.SRC); if (s != null) urls.add(s.toString()); } // BASE + HREF (change context!) if (element == Element.BASE && base == null) { s = attrMap.get(Attribute.HREF); if (s != null) base = s.toString(); } // META REFRESH/LOCATION if (element == Element.META) { final MutableString equiv = attrMap.get(Attribute.HTTP_EQUIV); final MutableString content = attrMap.get(Attribute.CONTENT); if (equiv != null && content != null) { equiv.toLowerCase(); // http-equiv="refresh" content="0;URL=http://foo.bar/..." if (equiv.equals("refresh") && (metaRefresh == null)) { final int pos = URLEQUAL_PATTERN.search(content); if (pos != -1) metaRefresh = content.substring(pos + URLEQUAL_PATTERN.length()).toString(); } // http-equiv="location" content="http://foo.bar/..." if (equiv.equals("location") && (metaLocation == null)) metaLocation = attrMap.get(Attribute.CONTENT).toString(); } } return true; } /** * Returns the URL specified by META HTTP-EQUIV elements of location type. 
More * precisely, this method returns a non- {@code null} result iff there is at least one META HTTP-EQUIV * element specifying a location URL (if there is more than one, we keep the first one). * * @return the first URL specified by a META HTTP-EQUIV elements of location type, or * {@code null}. */ public String metaLocation() { return metaLocation; } /** * Returns the URL specified by the BASE element. More precisely, this method returns a non- * {@code null} result iff there is at least one BASE element specifying a derelativisation URL * (if there is more than one, we keep the first one). * * @return the first URL specified by a BASE element, or {@code null}. */ public String base() { return base; } /** * Returns the URL specified by META HTTP-EQUIV elements of refresh type. More * precisely, this method returns a non- {@code null} result iff there is at least one META HTTP-EQUIV * element specifying a refresh URL (if there is more than one, we keep the first one). * * @return the first URL specified by a META HTTP-EQUIV elements of refresh type, or * {@code null}. */ public String metaRefresh() { return metaRefresh; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy