All Downloads are FREE. Search and download functionalities are using the official Maven repository.

us.codecraft.webmagic.selector.HtmlNode Maven / Gradle / Ivy

There is a newer version: 1.0.2
Show newest version
package us.codecraft.webmagic.selector;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.util.ArrayList;
import java.util.List;
import java.util.ListIterator;

/**
 * @author [email protected]
 */
public class HtmlNode extends AbstractSelectable {

    private final List elements;

    public HtmlNode(List elements) {
        this.elements = elements;
    }

    public HtmlNode() {
        elements = null;
    }

    protected List getElements() {
        return elements;
    }

    @Override
    public Selectable smartContent() {
        SmartContentSelector smartContentSelector = Selectors.smartContent();
        return select(smartContentSelector, getSourceTexts());
    }

    @Override
    public Selectable links() {
        return xpath("//a/@href");
    }

    @Override
    public Selectable xpath(String xpath) {
        XpathSelector xpathSelector = Selectors.xpath(xpath);
        return selectElements(xpathSelector);
    }

    @Override
    public Selectable selectList(Selector selector) {
        if (selector instanceof BaseElementSelector) {
           return selectElements((BaseElementSelector) selector);
        }
        return selectList(selector, getSourceTexts());
    }

    @Override
    public Selectable select(Selector selector) {
        return selectList(selector);
    }

    /**
     * select elements
     *
     * @param elementSelector elementSelector
     * @return result
     */
    protected Selectable selectElements(BaseElementSelector elementSelector) {
        ListIterator elementIterator = getElements().listIterator();
        if (!elementSelector.hasAttribute()) {
            List resultElements = new ArrayList();
            while (elementIterator.hasNext()) {
                Element element = checkElementAndConvert(elementIterator);
                List selectElements = elementSelector.selectElements(element);
                resultElements.addAll(selectElements);
            }
            return new HtmlNode(resultElements);
        } else {
            // has attribute, consider as plaintext
            List resultStrings = new ArrayList();
            while (elementIterator.hasNext()) {
                Element element = checkElementAndConvert(elementIterator);
                List selectList = elementSelector.selectList(element);
                resultStrings.addAll(selectList);
            }
            return new PlainText(resultStrings);

        }
    }

    /**
     * Only document can be select
     * See: https://github.com/code4craft/webmagic/issues/113
     *
     * @param elementIterator elementIterator
     * @param element element
     */
    private Element checkElementAndConvert(ListIterator elementIterator) {
        Element element = elementIterator.next();
        if (!(element instanceof Document)) {
            Document root = new Document(element.ownerDocument().baseUri());
            Element clone = element.clone();
            root.appendChild(clone);
            elementIterator.set(root);
            return root;
        }
        return element;
    }

    @Override
    public Selectable $(String selector) {
        CssSelector cssSelector = Selectors.$(selector);
        return selectElements(cssSelector);
    }

    @Override
    public Selectable $(String selector, String attrName) {
        CssSelector cssSelector = Selectors.$(selector, attrName);
        return selectElements(cssSelector);
    }

    @Override
    public List nodes() {
        List selectables = new ArrayList();
        for (Element element : getElements()) {
            List childElements = new ArrayList(1);
            childElements.add(element);
            selectables.add(new HtmlNode(childElements));
        }
        return selectables;
    }

    @Override
    protected List getSourceTexts() {
        List sourceTexts = new ArrayList(getElements().size());
        for (Element element : getElements()) {
            sourceTexts.add(element.toString());
        }
        return sourceTexts;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy