All downloads are free. Search and download functionality uses the official Maven repository.

net.dreamlu.mica.spider.mapper.DomMapper Maven / Gradle / Ivy

There is a newer version: 3.3.2
Show newest version
/*
 * Copyright (c) 2019-2029, Dreamlu ([email protected] & www.dreamlu.net).
 *
 * Licensed under the GNU LESSER GENERAL PUBLIC LICENSE 3.0;
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.gnu.org/licenses/lgpl.html
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package net.dreamlu.mica.spider.mapper;

import net.dreamlu.mica.core.utils.Exceptions;
import net.dreamlu.mica.http.ResponseSpec;
import org.jsoup.helper.DataUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.parser.Parser;
import org.jsoup.select.Elements;
import org.springframework.cglib.proxy.Enhancer;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

/**
 * Crawler helper that maps XML/HTML content to Java beans, based on jsoup.
 *
 * <p>All methods are static. Beans are produced as cglib proxies whose method
 * calls are routed through a {@code CssQueryMethodInterceptor}.
 *
 * @author L.cm
 */
public class DomMapper {

	/**
	 * Reads the response body as a string and parses it into a jsoup Document.
	 *
	 * @param response ResponseSpec whose body is parsed
	 * @return Document
	 */
	public static Document asDocument(ResponseSpec response) {
		return readDocument(response.asString());
	}

	/**
	 * Reads a stream into a jsoup Document, decoding it as UTF-8 with an
	 * empty base URI.
	 *
	 * <p>An {@link IOException} raised while loading is rethrown through
	 * {@code Exceptions.unchecked}. This method does not close the stream itself.
	 *
	 * @param inputStream InputStream to read
	 * @return Document
	 */
	public static Document readDocument(InputStream inputStream) {
		try {
			return DataUtil.load(inputStream, StandardCharsets.UTF_8.name(), "");
		} catch (IOException e) {
			throw Exceptions.unchecked(e);
		}
	}

	/**
	 * Parses an html string into a jsoup Document with an empty base URI.
	 *
	 * @param html html String
	 * @return Document
	 */
	public static Document readDocument(String html) {
		return Parser.parse(html, "");
	}

	/**
	 * Reads the response body (as a stream) into a Java bean.
	 *
	 * @param response ResponseSpec
	 * @param clazz    bean Class
	 * @param <T>      bean type
	 * @return bean instance
	 */
	public static <T> T readValue(ResponseSpec response, final Class<T> clazz) {
		return readValue(response.asStream(), clazz);
	}

	/**
	 * Reads a stream into a Java bean.
	 *
	 * @param inputStream InputStream
	 * @param clazz       bean Class
	 * @param <T>         bean type
	 * @return bean instance
	 */
	public static <T> T readValue(InputStream inputStream, final Class<T> clazz) {
		return readValue(readDocument(inputStream), clazz);
	}

	/**
	 * Reads an html string into a Java bean.
	 *
	 * @param html  html String
	 * @param clazz bean Class
	 * @param <T>   bean type
	 * @return bean instance
	 */
	public static <T> T readValue(String html, final Class<T> clazz) {
		return readValue(readDocument(html), clazz);
	}

	/**
	 * Creates a bean backed by the given element.
	 *
	 * <p>The returned object is a cglib-generated subclass of {@code clazz};
	 * its callback is a {@code CssQueryMethodInterceptor} constructed with
	 * {@code clazz} and {@code doc}.
	 * NOTE(review): presumably the interceptor resolves bean values from
	 * {@code doc} via {@code @CssQuery} annotations; confirm in that class.
	 *
	 * @param doc   xml element
	 * @param clazz bean Class, used as the proxy superclass
	 * @param <T>   bean type
	 * @return bean instance
	 */
	@SuppressWarnings("unchecked")
	public static <T> T readValue(final Element doc, final Class<T> clazz) {
		Enhancer enhancer = new Enhancer();
		enhancer.setSuperclass(clazz);
		enhancer.setUseCache(true);
		enhancer.setContextClass(clazz);
		enhancer.setCallback(new CssQueryMethodInterceptor(clazz, doc));
		// The proxy extends clazz, so the cast to T is safe.
		return (T) enhancer.create();
	}

	/**
	 * Reads the response body (as a stream) into a list of Java beans.
	 *
	 * @param <T>      bean type
	 * @param response ResponseSpec
	 * @param clazz    bean Class
	 * @return bean list
	 */
	public static <T> List<T> readList(ResponseSpec response, final Class<T> clazz) {
		return readList(response.asStream(), clazz);
	}

	/**
	 * Reads a stream into a list of Java beans.
	 *
	 * @param <T>         bean type
	 * @param inputStream InputStream
	 * @param clazz       bean Class
	 * @return bean list
	 */
	public static <T> List<T> readList(InputStream inputStream, final Class<T> clazz) {
		return readList(readDocument(inputStream), clazz);
	}

	/**
	 * Reads an html string into a list of Java beans.
	 *
	 * @param <T>   bean type
	 * @param html  html String
	 * @param clazz bean Class
	 * @return bean list
	 */
	public static <T> List<T> readList(String html, final Class<T> clazz) {
		return readList(readDocument(html), clazz);
	}

	/**
	 * Reads every element matched by the class-level {@code @CssQuery} selector
	 * into a bean, one bean per matched element.
	 *
	 * @param doc   xml element to select from
	 * @param clazz bean Class; must be annotated with {@code @CssQuery}
	 * @param <T>   bean type
	 * @return bean list, empty when the selector matches nothing
	 * @throws IllegalArgumentException if {@code clazz} has no {@code @CssQuery} annotation
	 */
	public static <T> List<T> readList(Element doc, Class<T> clazz) {
		CssQuery annotation = clazz.getAnnotation(CssQuery.class);
		if (annotation == null) {
			throw new IllegalArgumentException("DomMapper readList " + clazz + " mast has annotation @CssQuery.");
		}
		String cssQueryValue = annotation.value();
		Elements elements = doc.select(cssQueryValue);
		// One bean per matched element, so the final size is known up front.
		List<T> valueList = new ArrayList<>(elements.size());
		for (Element element : elements) {
			valueList.add(readValue(element, clazz));
		}
		return valueList;
	}

}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy