All downloads are free. The search and download features use the official Maven repository.

com.feilong.net.jsoup.JsoupUtil Maven / Gradle / Ivy

Go to download

feilong is a suite of core and extended libraries that includes utility classes and support for HTTP, Excel, CSV, I/O, and much more.

There is a newer version: 4.0.8
Show newest version
/*
 * Copyright (C) 2008 feilong
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.feilong.net.jsoup;

import static com.feilong.core.TimeInterval.MILLISECOND_PER_SECONDS;

import java.io.IOException;

import javax.net.ssl.SSLContext;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.feilong.core.Validate;
import com.feilong.lib.lang3.StringUtils;
import com.feilong.net.SSLContextBuilder;
import com.feilong.net.SSLProtocol;
import com.feilong.net.UriProcessor;
import com.feilong.tools.slf4j.Slf4jUtil;

/**
 * Jsoup 的工具类.
 * 
 * @author feilong
 * @since 1.0.6
 */
public final class JsoupUtil{

    /** {@value}. */
    public static final int     DEFAULT_TIMEOUT_MILLIS = 20 * MILLISECOND_PER_SECONDS;

    /**
     * 伪造的 useragent.
     * 
     * @since 1.8.1
     */
    private static final String DEFAULT_USER_AGENT     = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.21 (KHTML, like Gecko) Chrome/19.0.1042.0 Safari/535.21";

    //---------------------------------------------------------------
    /** Don't let anyone instantiate this class. */
    private JsoupUtil(){
        //AssertionError不是必须的. 但它可以避免不小心在类的内部调用构造器. 保证该类在任何情况下都不会被实例化.
        //see 《Effective Java》 2nd
        throw new AssertionError("No " + getClass().getName() + " instances for you!");
    }

    //---------------------------------------------------------------

    /**
     * 通过url 获得文档.
     * 
     * @param urlString
     *            the url string
     * @return the document
     * @throws JsoupUtilException
     *             if Exception
     */
    public static Document getDocument(String urlString){
        return getDocument(urlString, DEFAULT_USER_AGENT);
    }

    //---------------------------------------------------------------

    /**
     * Parse HTML into a Document. As no base URI is specified, absolute URL detection relies on the HTML including a
     * {@code } tag.
     * 
     * 

示例:

* *
* * 比如 * *
    {@code
    
* * * * *
} *
* * 可以将这个字符串 转成 {@link Document} * *
* * @param html * the html * @return 如果 html 是null,抛出 {@link NullPointerException}
* @since 1.13.0 */ public static Document parse(String html){ Validate.notNull(html, "html can't be null!"); return Jsoup.parse(html); } //--------------------------------------------------------------- /** * 通过url 获得文档. * * @param urlString * the url * @param userAgent * the user agent * @return the document * @throws JsoupUtilException * if Exception * @see org.jsoup.Jsoup#connect(String) * @see org.jsoup.Connection#userAgent(String) * @see org.jsoup.Connection#timeout(int) * @see org.jsoup.Connection#get() */ public static Document getDocument(String urlString,String userAgent){ Validate.notBlank(urlString, "urlString can't be blank!"); //since 3.0.10 urlString = UriProcessor.process(urlString, true); //--------------------------------------------------------------- try{ Connection connection = Jsoup.connect(urlString)// .userAgent(userAgent)// .timeout(DEFAULT_TIMEOUT_MILLIS); if (StringUtils.startsWithIgnoreCase(urlString, "https://")){ SSLContext build = SSLContextBuilder.build(SSLProtocol.TLSv12); //since 3.0.7 connection.sslSocketFactory(build.getSocketFactory()); } return connection.get(); }catch (IOException e){ throw new JsoupUtilException(Slf4jUtil.format("urlString:[{}],userAgent:[{}]", urlString, userAgent), e); } } //--------------------------------------------------------------- /** * Gets the elements by select. * * @param url * the url * @param selectQuery * the select query * @return 如果 url 是null,抛出 {@link NullPointerException}
* 如果 url 是blank,抛出 {@link IllegalArgumentException}
* 如果 selectQuery 是null,抛出 {@link NullPointerException}
* 如果 selectQuery 是blank,抛出 {@link IllegalArgumentException}
* @see #getDocument(String) * @see org.jsoup.nodes.Element#select(String) */ public static Elements getElementsBySelect(String url,String selectQuery){ Validate.notBlank(url, "url can't be blank!"); Validate.notBlank(selectQuery, "selectQuery can't be blank!"); Document document = getDocument(url); return document.select(selectQuery); } //--------------------------------------------------------------- /** * Gets the element by id. * * @param url * the url * @param id * the id * @return 如果 url 是null,抛出 {@link NullPointerException}
* 如果 url 是blank,抛出 {@link IllegalArgumentException}
* 如果 id 是null,抛出 {@link NullPointerException}
* 如果 id 是blank,抛出 {@link IllegalArgumentException}
* @see #getDocument(String) * @see org.jsoup.nodes.Element#getElementById(String) */ public static Element getElementById(String url,String id){ Validate.notBlank(url, "url can't be blank!"); Validate.notBlank(id, "id can't be blank!"); Document document = getDocument(url); return document.getElementById(id); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy