All downloads are free. The search and download features use the official Maven repository.

org.opengraph.OpenGraph Maven / Gradle / Ivy

package org.opengraph;

import java.net.HttpCookie;
import java.net.URI;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;
import org.springframework.http.HttpEntity;
import org.springframework.http.HttpHeaders;
import org.springframework.http.HttpMethod;
import org.springframework.http.HttpStatus;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.util.CollectionUtils;
import org.springframework.util.StringUtils;
import org.springframework.web.client.RestTemplate;
import org.springframework.web.util.HtmlUtils;
import org.springframework.web.util.UriComponentsBuilder;

/**
 * 
 * http://shopping.interpark.com/product/productInfo.do?prdNo=4690768866&dispNo=016001
 * http://goo.gl/fJLWja
 * http://www.g9.co.kr/Display/VIP/Index/693661469?jaehuid=200007334&NaPm=ct%3Dizi2561k%7Cci%3D9cb0c51ad0cfdc43896816e9b948e9e2390f7d28%7Ctr%3Dslsl%7Csn%3D280455%7Chk%3D875c6ad1b5a326aadb31b1b88228743ad464b374
 * http://blog.naver.com/mrj_bundang
 * http://wook.cloudapp.net
 * http://google.com
 * http://naver.me/GtlkiRH0
 * 
*/ public class OpenGraph { private String[] removes = { HttpHeaders.COOKIE, HttpHeaders.ACCEPT_ENCODING }; private List accepts = Arrays.asList(MediaType.TEXT_HTML); private final Document document; private final HttpHeaders headers; private final ResponseEntity response; private final RestTemplate restTemplate; private URI uri; private int count; public OpenGraph(RestTemplate restTemplate, URI uri, HttpHeaders headers) { this(restTemplate, uri, headers, 5); } public OpenGraph(RestTemplate restTemplate, URI uri, HttpHeaders headers, int max) { if (!StringUtils.hasText(uri.getPath())) { uri = UriComponentsBuilder.fromUri(uri).replacePath("/").build().toUri(); } this.uri = uri; for (String remove : this.removes) { headers.remove(remove); } headers.setAccept(this.accepts); this.headers = headers; this.count = 0; this.restTemplate = restTemplate; this.response = getResponse(uri, max, true); this.document = Jsoup.isValid(this.response.getBody(), Whitelist.none()) ? Jsoup.parse("") : Jsoup.parse(this.response.getBody()); } private ResponseEntity getResponseEntity(URI uri) { this.uri = uri; this.count++; ResponseEntity responseEntity = this.restTemplate.exchange(uri, HttpMethod.GET, new HttpEntity(null, this.headers), String.class); setCookie(responseEntity.getHeaders().get(HttpHeaders.SET_COOKIE)); return responseEntity; } private void setCookie(List cookies) { if (!CollectionUtils.isEmpty(cookies)) { Set httpCookies = new LinkedHashSet(); for (String cookie : cookies) { for (HttpCookie httpCookie : HttpCookie.parse(cookie)) { httpCookie.setDomain(null); httpCookie.setPath(null); httpCookies.add(httpCookie); } } cookies = this.headers.get(HttpHeaders.COOKIE); if (!CollectionUtils.isEmpty(cookies)) { for (String cookie : cookies) { int index = cookie.indexOf('='); if (index != -1) { httpCookies.add(new HttpCookie(cookie.substring(0, index), cookie.substring(index + 1, cookie.length()))); } } } this.headers.remove(HttpHeaders.COOKIE); for (HttpCookie httpCookie : 
httpCookies) { if (StringUtils.hasText(httpCookie.getValue())) { this.headers.add(HttpHeaders.COOKIE, httpCookie.getName() + '=' + httpCookie.getValue()); } } } } /** * @see org.springframework.util.StringUtils#trimAllWhitespace(String) */ private ResponseEntity getResponse(URI uri, int max, boolean recursive) { ResponseEntity response = getResponseEntity(uri); if (this.count > max) { return response; } while (response.getHeaders().getLocation() != null) { uri = getValidPath(response.getHeaders().getLocation().toString()); response = getResponseEntity(uri); } if (recursive && !Jsoup.isValid(response.getBody(), Whitelist.none())) { Document document = Jsoup.parse(response.getBody()); // if have , it will call recursive for (Element element : document.getElementsByAttributeValue("property", "og:url")) { if (element.hasAttr("content")) { String text = element.attr("content"); URI url = getValidPath(text); if (StringUtils.hasText(text) && !uri.toString().equals(text)) { ResponseEntity responseEntity = getResponse(url, max, true); if (responseEntity.getStatusCode().is2xxSuccessful()) { return responseEntity; } } } } // if have , it will call recursive for (Element element : document.getElementsByTag("frame")) { if (element.hasAttr("src")) { String text = element.attr("src"); URI url = getValidPath(text); if (StringUtils.hasText(text)) { ResponseEntity responseEntity = getResponse(url, max, false); if (responseEntity.getStatusCode().is2xxSuccessful()) { return responseEntity; } } } } } return response; } public String getContent(String value) { // Elements elements = this.document.getElementsByAttributeValue("property", value); if (elements.hasAttr("content")) { String text = elements.attr("content"); if (StringUtils.hasText(text)) { return text; } } return null; } public URI getImage() { // Elements elements = this.document.getElementsByAttributeValue("property", "og:image"); if (elements.hasAttr("content")) { String text = elements.attr("content"); if 
(StringUtils.hasText(text)) { return getValidPath(text); } } for (Element element : this.document.head().select("link[href~=.*\\.(ico|png)]")) { if (element.hasAttr("href")) { String text = element.attr("href"); if (StringUtils.hasText(text)) { return getValidPath(text); } } } for (Element element : this.document.head().select("meta[itemprop=image]")) { if (element.hasAttr("content")) { String text = element.attr("content"); if (StringUtils.hasText(text)) { return getValidPath(text); } } } // 2nd -> img in div for (Element element : this.document.getElementsByTag("div")) { if (element.children().size() > 0) { element = element.child(0); if (element.tagName().equals("img")) { if (element.hasAttr("width")) { String text = element.attr("src"); if (StringUtils.hasText(text)) return getValidPath(text); } } } } // 2nd -> img in p for (Element element : this.document.getElementsByTag("p")) { for (Element elementTag : element.getElementsByTag("img")) { if (elementTag.hasAttr("src")) { String text = elementTag.attr("src"); if (StringUtils.hasText(text)) return getValidPath(text); } } } // 2nd -> img in dd for (Element element : this.document.getElementsByTag("dd")) { for (Element elementTag : element.getElementsByTag("img")) { if (elementTag.hasAttr("src")) { String text = elementTag.attr("src"); if (StringUtils.hasText(text)) return getValidPath(text); } } } // 3rd -> img in html for (Element element : this.document.getElementsByTag("img")) { if (element.hasAttr("src")) { String text = element.attr("src"); if (StringUtils.hasText(text)) return getValidPath(text); } } // etc empty return null; } private URI getValidPath(String url) { URI uri; if (isAbsoluteUrl(url)) { uri = UriComponentsBuilder.fromUriString(url).build().toUri(); } else { uri = this.uri.resolve(url); } if (!StringUtils.hasText(uri.getPath())) { uri = UriComponentsBuilder.fromUri(uri).replacePath("/").build().toUri(); } return uri; } private boolean isAbsoluteUrl(String url) { if (url == null) { return 
false; } final Pattern ABSOLUTE_URL = Pattern.compile("\\A[a-z0-9.+-]+://.*", Pattern.CASE_INSENSITIVE); return ABSOLUTE_URL.matcher(url).matches(); } public String getTitle() { // Elements elements = this.document.getElementsByAttributeValue("property", "og:title"); if (elements.hasAttr("content")) { String text = elements.attr("content"); if (StringUtils.hasText(text)) { return text; } } // elements = this.document.getElementsByAttributeValue("name", "title"); if (elements.hasAttr("content")) { String text = elements.attr("content"); if (StringUtils.hasText(text)) { return text; } } // * String title = this.document.title(); if (StringUtils.hasText(title)) { return title; } return null; } public String getDescription() { // Elements elements = this.document.getElementsByAttributeValue("property", "og:description"); if (elements.hasAttr("content")) { String text = elements.attr("content"); if (StringUtils.hasText(text)) { return text; } } // elements = this.document.getElementsByAttributeValue("name", "description"); if (elements.hasAttr("content")) { String text = elements.attr("content"); if (StringUtils.hasText(text)) { return text; } } //

*

for (Element element : this.document.getElementsByTag("p")) { if (element.hasText() && StringUtils.hasText(element.text())) { return element.text(); } } //
*
for (Element element : this.document.getElementsByTag("div")) { if (element.hasText() && StringUtils.hasText(element.text())) { return element.text(); } } // return empty return null; } public URI getUri() { return this.uri; } public HttpStatus getStatus() { return this.response.getStatusCode().is4xxClientError() || this.response.getStatusCode().is5xxServerError() ? HttpStatus.NOT_FOUND : this.response.getStatusCode(); } public Map getMap() { Map map = new LinkedHashMap(); String title = getTitle(); map.put("title", title == null ? this.uri : HtmlUtils.htmlUnescape(title)); String url = getContent("og:url"); map.put("url", url == null ? this.uri : HtmlUtils.htmlUnescape(url)); String description = getDescription(); map.put("description", description == null ? null : HtmlUtils.htmlUnescape(description)); map.put("type", getContent("og:type")); map.put("site_name", getContent("og:site_name")); URI image = getImage(); Map imageMap; if (image == null) { imageMap = Collections.singletonMap("url", UriComponentsBuilder.fromUri(this.uri).replacePath("favicon.ico").replaceQuery("").build().toUri()); } else { imageMap = new LinkedHashMap(); imageMap.put("url", image); imageMap.put("width", getContent("og:image:width")); imageMap.put("height", getContent("og:image:height")); } map.put("image", imageMap); return map; } }