All Downloads are FREE. Search and download functionalities are using the official Maven repository.

src.it.unimi.dsi.parser.callback.LinkExtractor Maven / Gradle / Ivy

package it.unimi.dsi.parser.callback;

/*
 * DSI utilities
 *
 * Copyright (C) 2005-2017 Sebastiano Vigna
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

import it.unimi.dsi.fastutil.objects.ObjectLinkedOpenHashSet;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.parser.Attribute;
import it.unimi.dsi.parser.BulletParser;
import it.unimi.dsi.parser.Element;
import it.unimi.dsi.util.TextPattern;

import java.util.Map;
import java.util.Set;

/**
 * A callback extracting links.
 *
 * 

This callbacks extracts links existing in the web page. The * links are then accessible in {@link #urls} (a set of {@link String}s). Note that * we guarantee that the iteration order in the set is exactly * the order in which links have been met (albeit copies appear * just once). */ public class LinkExtractor extends DefaultCallback { /** The pattern prefixing the URL in a META HTTP-EQUIV element of refresh type. */ private static final TextPattern URLEQUAL_PATTERN = new TextPattern("URL=", TextPattern.CASE_INSENSITIVE); /** The URLs resulting from the parsing process. */ public final Set urls = new ObjectLinkedOpenHashSet<>(); /** The URL contained in the first META HTTP-EQUIV element of refresh type (if any). */ private String metaRefresh = null; /** The URL contained in the first META HTTP-EQUIV element of location type (if any). */ private String metaLocation = null; /** The URL contained in the first BASE element (if any). */ private String base = null; /** * Configure the parser to parse elements and certain attributes. * *

* The required attributes are SRC , HREF , HTTP-EQUIV , and CONTENT * . * */ @Override public void configure(final BulletParser parser) { parser.parseTags(true); parser.parseAttributes(true); parser.parseAttribute(Attribute.SRC); parser.parseAttribute(Attribute.HREF); parser.parseAttribute(Attribute.HTTP_EQUIV); parser.parseAttribute(Attribute.CONTENT); } @Override public void startDocument() { urls.clear(); base = metaLocation = metaRefresh = null; } @Override public boolean startElement(final Element element, final Map attrMap) { Object s; // TODO: what about IMG? if (element == Element.A || element == Element.AREA || element == Element.LINK) { s = attrMap.get(Attribute.HREF); if (s != null) urls.add(s.toString()); } // IFRAME or FRAME + SRC if (element == Element.IFRAME || element == Element.FRAME || element == Element.EMBED) { s = attrMap.get(Attribute.SRC); if (s != null) urls.add(s.toString()); } // BASE + HREF (change context!) if (element == Element.BASE && base == null) { s = attrMap.get(Attribute.HREF); if (s != null) base = s.toString(); } // META REFRESH/LOCATION if (element == Element.META) { final MutableString equiv = attrMap.get(Attribute.HTTP_EQUIV); final MutableString content = attrMap.get(Attribute.CONTENT); if (equiv != null && content != null) { equiv.toLowerCase(); // http-equiv="refresh" content="0;URL=http://foo.bar/..." if (equiv.equals("refresh") && (metaRefresh == null)) { final int pos = URLEQUAL_PATTERN.search(content); if (pos != -1) metaRefresh = content.substring(pos + URLEQUAL_PATTERN.length()).toString(); } // http-equiv="location" content="http://foo.bar/..." if (equiv.equals("location") && (metaLocation == null)) metaLocation = attrMap.get(Attribute.CONTENT).toString(); } } return true; } /** * Returns the URL specified by META HTTP-EQUIV elements of location type. 
More * precisely, this method returns a non- {@code null} result iff there is at least one META HTTP-EQUIV * element specifying a location URL (if there is more than one, we keep the first one). * * @return the first URL specified by a META HTTP-EQUIV elements of location type, or * {@code null}. */ public String metaLocation() { return metaLocation; } /** * Returns the URL specified by the BASE element. More precisely, this method returns a non- * {@code null} result iff there is at least one BASE element specifying a derelativisation URL * (if there is more than one, we keep the first one). * * @return the first URL specified by a BASE element, or {@code null}. */ public String base() { return base; } /** * Returns the URL specified by META HTTP-EQUIV elements of refresh type. More * precisely, this method returns a non- {@code null} result iff there is at least one META HTTP-EQUIV * element specifying a refresh URL (if there is more than one, we keep the first one). * * @return the first URL specified by a META HTTP-EQUIV elements of refresh type, or * {@code null}. */ public String metaRefresh() { return metaRefresh; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy