All downloads are free. The search and download features use the official Maven repository.

org.apache.tika.sax.LinkContentHandler Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.sax;

import static org.apache.tika.sax.XHTMLContentHandler.XHTML;

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Content handler that collects links from an XHTML document.
 * <p>
 * A link is recorded for each <code>a</code>, <code>link</code>,
 * <code>iframe</code> and <code>img</code> element, and for each
 * <code>script</code> element that carries a <code>src</code> attribute.
 * Character data seen while such an element is open is forwarded to its
 * link builder. A link is added to the result list when its element ends,
 * so nested links are recorded before the links that enclose them.
 */
public class LinkContentHandler extends DefaultHandler {

    /**
     * Stack of link builders, one for each level of nested links currently
     * being processed. A usual case of a nested link would be a hyperlinked
     * image (<code>&lt;a href="..."&gt;&lt;img src="..."&gt;&lt;/a&gt;</code>),
     * but it's possible (though unlikely) for also other kinds of nesting
     * to occur. The innermost open link is always the first entry.
     */
    private final LinkedList<LinkBuilder> builderStack =
        new LinkedList<LinkBuilder>();

    /** Collected links, in the order in which their elements were closed. */
    private final List<Link> links = new ArrayList<Link>();

    /** Whether to collapse whitespace in anchor text */
    private final boolean collapseWhitespaceInAnchor;

    /**
     * Creates a handler that does not collapse whitespace in anchor text.
     */
    public LinkContentHandler() {
        this(false);
    }

    /**
     * Creates a handler with the given whitespace handling.
     *
     * @param collapseWhitespaceInAnchor value passed to each link builder
     *        when the finished link is requested from it
     */
    public LinkContentHandler(boolean collapseWhitespaceInAnchor) {
        super();

        this.collapseWhitespaceInAnchor = collapseWhitespaceInAnchor;
    }

    /**
     * Returns the list of collected links.
     * <p>
     * The returned list is the handler's own list, not a copy: links
     * collected later show up in it as well.
     *
     * @return collected links
     */
    public List<Link> getLinks() {
        return links;
    }

    //-------------------------------------------------------< ContentHandler>

    /**
     * Opens a new link builder when a link-bearing XHTML element starts.
     * Elements outside the XHTML namespace and all other XHTML elements
     * are ignored.
     *
     * @param uri namespace URI of the element
     * @param local local name of the element
     * @param name qualified name of the element (unused)
     * @param attributes attributes of the element
     */
    @Override
    public void startElement(
            String uri, String local, String name, Attributes attributes) {
        if (XHTML.equals(uri)) {
            if ("a".equals(local)) {
                LinkBuilder builder = new LinkBuilder("a");
                builder.setURI(attributes.getValue("", "href"));
                builder.setTitle(attributes.getValue("", "title"));
                builder.setRel(attributes.getValue("", "rel"));
                builderStack.addFirst(builder);
            } else if ("link".equals(local)) {
                LinkBuilder builder = new LinkBuilder("link");
                builder.setURI(attributes.getValue("", "href"));
                builder.setRel(attributes.getValue("", "rel"));
                builderStack.addFirst(builder);
            } else if ("script".equals(local)) {
                // Only external scripts are links; an embedded script has
                // no src and therefore gets no builder.
                if (attributes.getValue("", "src") != null) {
                    LinkBuilder builder = new LinkBuilder("script");
                    builder.setURI(attributes.getValue("", "src"));
                    builderStack.addFirst(builder);
                }
            } else if ("iframe".equals(local)) {
                LinkBuilder builder = new LinkBuilder("iframe");
                builder.setURI(attributes.getValue("", "src"));
                builderStack.addFirst(builder);
            } else if ("img".equals(local)) {
                LinkBuilder builder = new LinkBuilder("img");
                builder.setURI(attributes.getValue("", "src"));
                builder.setTitle(attributes.getValue("", "title"));
                builder.setRel(attributes.getValue("", "rel"));
                builderStack.addFirst(builder);

                // Feed the alt text through characters() so that it reaches
                // the image's own builder and every enclosing link builder.
                String alt = attributes.getValue("", "alt");
                if (alt != null) {
                    char[] ch = alt.toCharArray();
                    characters(ch, 0, ch.length);
                }
            }
        }
    }

    /**
     * Forwards character data to every link builder that is currently open.
     *
     * @param ch character buffer
     * @param start offset of the first character to forward
     * @param length number of characters to forward
     */
    @Override
    public void characters(char[] ch, int start, int length) {
        for (LinkBuilder builder : builderStack) {
            builder.characters(ch, start, length);
        }
    }

    /**
     * Treats ignorable whitespace exactly like ordinary character data.
     *
     * @param ch character buffer
     * @param start offset of the first character to forward
     * @param length number of characters to forward
     */
    @Override
    public void ignorableWhitespace(char[] ch, int start, int length) {
        characters(ch, start, length);
    }

    /**
     * Closes the innermost open link builder when its element ends and
     * adds the resulting link to the collected links.
     *
     * @param uri namespace URI of the element
     * @param local local name of the element
     * @param name qualified name of the element (unused)
     */
    @Override
    public void endElement(String uri, String local, String name) {
        if (!builderStack.isEmpty() && XHTML.equals(uri)) {
            if ("a".equals(local) || "img".equals(local) || "link".equals(local) ||
                    "script".equals(local) || "iframe".equals(local)) {
                // ensure this is the correct builder. not all tags correspond
                // to a LinkBuilder, e.g. for embedded scripts
                if (builderStack.getFirst().getType().equals(local)) {
                    LinkBuilder builder = builderStack.removeFirst();
                    links.add(builder.getLink(collapseWhitespaceInAnchor));
                }
            }
        }
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy