All downloads are free. Search and download functionality uses the official Maven repository.

com.syncthemall.diffbot.Diffbot Maven / Gradle / Ivy

/**
 * The MIT License
 * Copyright (c) 2013 Pierre-Denis Vanduynslager
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package com.syncthemall.diffbot;

import static com.syncthemall.diffbot.Constants.FIELDS;
import static com.syncthemall.diffbot.Constants.FORMAT;
import static com.syncthemall.diffbot.Constants.MODE;
import static com.syncthemall.diffbot.Constants.STATS;
import static com.syncthemall.diffbot.Constants.TIMEOUT;
import static com.syncthemall.diffbot.Constants.TOKEN;
import static com.syncthemall.diffbot.Constants.URL;
import static com.syncthemall.diffbot.Constants.USER_AGENT;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.ResourceBundle;

import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBException;

import com.google.api.client.http.HttpExecuteInterceptor;
import com.google.api.client.http.HttpRequest;
import com.google.api.client.http.HttpRequestFactory;
import com.google.api.client.http.HttpRequestInitializer;
import com.google.api.client.http.HttpTransport;
import com.google.api.client.json.JsonFactory;
import com.google.api.client.json.JsonObjectParser;
import com.syncthemall.diffbot.model.Model;
import com.syncthemall.diffbot.model.PageType;
import com.syncthemall.diffbot.model.classifier.Classified;

/**
 * This is the main class of the SDK and allows to call:
 * 
    *
  • The Article API to extract informations (title, images, main content etc...) from an article web page.
  • *
  • The Frontpage API to extract the elements (articles, posts, etc...) from the home page of a website.
  • *
  • The Image API to extract the images from a web page.
  • *
  • The Product API to extract the products from a web page.
  • *
  • The Classifier API that takes any web link and automatically determines what type of page it is.
  • *
  • The batch API allowing execute multiple Article, Frontpage, Image or Product requests at once.
  • *
*

* This class is thread-safe and should be instantiated only once. If the batch API is used this class has * to be instantiated only once as it store the queue of request to execute in a batch. *

* The underlying implementations for the HTTP client (Apache, GAE url fetch, JVM default) and the JSON parser (Gson or * Jackson) can be chosen during construction {@link Diffbot#Diffbot(HttpTransport, JsonFactory, String)}. *

* For example to use Apache and Jackson: * *

 * private static Diffbot diffbot;
 * ...
 * diffbot = new Diffbot(new ApacheHttpTransport(), new JacksonFactory(), "your dev token");
 * 
* * Note: google-http-client-jackson2 has to be in the classpath. *

* The Article API
* Calling the API is as simple as : * *

 * Article article = diffbot.article().analyze("web page URL").withFields("*").execute();
 * 
*

* The Frontpage API
* Calling the API is as simple as : * *

 * Frontpage frontpage = diffbot.frontpage().analyze("web page URL").execute();
 * 
* * The Image API
* Calling the API is as simple as : * *
 * Images images = diffbot.images().analyze("web page URL").withFields("*").execute();
 * 
*

* * The Product API
* Calling the API is as simple as : * *

 * Products products = diffbot.products().analyze("web page URL").withFields("*").execute();
 * 
* * The Classifier API
* Calling the API is as simple as : * *
 * Classified classified = diffbot.classifier().analyze("web page URL").withFields("*").execute();
 * 
* * If the {@code Classified} resulting object is of {@link PageType} {@link PageType#ARTICLE}, {@link PageType#IMAGE} or * {@link PageType#PRODUCT} it can be parsed as respectively an {@link com.syncthemall.diffbot.model.article.Article}, * an {@link com.syncthemall.diffbot.model.images.Images} or a {@link com.syncthemall.diffbot.model.products.Products}: * *
 * if (classified.getType().equals(PageType.ARTICLE)) {
 * 	Article article = classified.asArticle();
 * }
 * 
*

* * The Batch API
* The batch API allows to prepare multiple Article, Frontpage, Image, Product and/or Classifier request and to send * them all at once. The batch API: *

    *
  • limits the number of API calls and reduces the network load, by avoiding http overhead for every request
  • *
  • slightly improve the performances if a lot of requests are executed
  • *
  • limits the usage of the Diffbot quota
  • *
* To prepare requests to be executed in batch: * *
 * Future<Article> fArticle = diffbot.article().analyze("web page URL").queue();
 * Future<Frontpage> fFrontpage = diffbot.frontpage().analyze("web page URL").queue();
 * Future<Images> fImages = diffbot.images().analyze("web page URL").queue();
 * Future<Products> fProducts = diffbot.products().analyze("web page URL").queue();
 * Future<Classified> fClassified = diffbot.classifier().analyze("web page URL").queue();
 * 
*

* Note that this can be done concurrently by multiple threads. The {@code Diffbot} class is fully thread-safe. *

* At this point no call has been done to Diffbot. To obtain the results: * *

 * Article article = fArticle.get();
 * Frontpage frontpage = fFrontpage.get();
 * Images images = fImages.get();
 * Products products = fProducts.get();
 * Classified classified = fClassified.get();
 * 
* * The first line trigger a batch request that retrieves the results for all the requests added since the last call to * {@code Future#get()}. The second line doesn't need to do any API call as the result was retrieve during the execution * of the fist line. * * @author Pierre-Denis Vanduynslager */ public class Diffbot { private ResourceBundle bundle = ResourceBundle.getBundle("com.syncthemall.diffbot.messages.Messages"); private JAXBContext jAXBContext; private JsonFactory jsonFactory; private HttpRequestFactory requestFactory; private List> futures = new ArrayList>(); private int maxBatchRequest = 25; private int batchRequestTimeout = 300000; /** * Constructor allowing to set the Diffbot developer token and the underlying implementations for the HTTP client * and the JSON parser. *

* The HTTP implementation can be: *

    *
  • {@code ApacheHttpTransport}: Apache HTTP client.
  • *
  • {@code NetHttpTransport}: Default JVM HTTP client.
  • *
  • {@code UrlFetchTransport}: URL * Fetch to use with Google App Engine.
  • *
* The JSON parser implementation can be: *
    *
  • {@code GsonFactory}: Gson.
  • *
  • {@code JsonFactory}: Jackson 2.
  • *
* * @param httpTransport an implementation of {@code HttpTransport} * @param jsonFactory an implementation of {@code JsonFactory} * @param token your Diffbot developer token * @throws JAXBException if JAXB is not in the classpath */ public Diffbot(final HttpTransport httpTransport, final JsonFactory jsonFactory, final String token) throws JAXBException { if (token == null || token.isEmpty()) { throw new IllegalArgumentException(bundle.getString("token.empty")); } if (httpTransport == null) { throw new IllegalArgumentException(bundle.getString("transport.null")); } if (jsonFactory == null) { throw new IllegalArgumentException(bundle.getString("jsonfactory.null")); } this.jsonFactory = jsonFactory; jAXBContext = JAXBContext.newInstance(com.syncthemall.diffbot.model.frontpage.Frontpage.class); this.requestFactory = httpTransport.createRequestFactory(new HttpRequestInitializer() { @Override public void initialize(final HttpRequest request) { request.setParser(new JsonObjectParser(jsonFactory)); request.getHeaders().setUserAgent(USER_AGENT); request.setInterceptor(new HttpExecuteInterceptor() { @Override public void intercept(final HttpRequest request) throws IOException { request.getUrl().set(TOKEN, token); } }); } }); } /** * @return the JAXB Context. **/ public final JAXBContext getJAXBContext() { return jAXBContext; } /** * @return the JSON Factory. **/ public final JsonFactory getJsonFactory() { return jsonFactory; } /** * @return the HTTP Request Factory. **/ public final HttpRequestFactory getRequestFactory() { return requestFactory; } /** * @return the list of {@code Future} currently in the batch queue. */ public final List> getFutures() { return futures; } /** * An accessor for creating requests from the {@link Frontpage} collection. * *

* The typical use is: *

* *
	 *   {@code Diffbot diffbot = new Diffboat(...);}
	 *   {@code Diffboat.Frontpage.Analyze request = diffboat.frontpage().analyze(parameters ...)}
	 * 
* * @return the resource collection */ public final Frontpage frontpage() { return new Frontpage(); } /** * The "frontpage" collection of methods. */ public final class Frontpage { /** * Get a frontpage analyzed by Diffbot. * * Create a request for the method "frontpage". * * This request holds the parameters needed by the diffbot server. After setting any optional parameters, call * the {@link Analyze#execute()} method to invoke the remote operation or {@link Analyze#queue()} to add it to * the batch queue. * * @param url URL to extract the {@link Frontpage} from. * @return the request */ public Analyze analyze(final String url) { return new Analyze(url); } /** * Request to the Diffbot Frontpage API. */ public class Analyze extends DiffbotRequest { private static final String FRONTPAGE_URI = "http://www.diffbot.com/api/frontpage"; /** * Creates a {@code DiffbotRequest} to extract the elements of a front page. * * @param url URL to extract the {@link Frontpage} from. */ protected Analyze(final String url) { super(Diffbot.this, com.syncthemall.diffbot.model.frontpage.Frontpage.class, FRONTPAGE_URI); set(URL, url); set(FORMAT, "xml"); } @Override protected final ApiType getApiType() { return ApiType.FRONTPAGE; } } } /** * An accessor for creating requests from the {@link Article} collection. * *

* The typical use is: *

* *
	 *   {@code Diffbot diffbot = new Diffboat(...);}
	 *   {@code Diffboat.Article.Analyze request = diffboat.article().analyze(parameters ...)}
	 * 
* * @return the resource collection */ public final Article article() { return new Article(); } /** * The "articles" collection of methods. */ public final class Article { /** * Get an article analyzed by Diffbot. * * Create a request for the method "article". * * This request holds the parameters needed by the diffbot server. After setting any optional parameters, call * the {@link Analyze#execute()} method to invoke the remote operation or {@link Analyze#queue()} to add it to * the batch queue. * * @param url URL to extract the {@link Article} from. * @return the request */ public Analyze analyze(final String url) { return new Analyze(url); } /** * Request to the Diffbot Article API. */ public class Analyze extends DiffbotRequest { private static final String ARTICLE_URI = "http://api.diffbot.com/v2/article"; /** * Creates a {@code DiffbotRequest} to extract an article of a given URL. * * @param url URL to extract the {@link Article} from. */ protected Analyze(final String url) { super(Diffbot.this, com.syncthemall.diffbot.model.article.Article.class, ARTICLE_URI); set(URL, url); } /** * The fields parameter allows to limit or expand which fields are returned in the response. For nested * arrays, use parentheses to retrieve specific fields, or * to return all sub-fields. *

* For example: meta,links,images(*) * * @param fields information returned by the API * @return the instance of this request * * @see Diffbot - Article API Documentation */ public final Analyze withFields(final String fields) { set(FIELDS, fields); return this; } /** * Overrides the default API timeout of 5000ms. * * @param timeout The timeout in milliseconds * * @return the instance of this request */ public final Analyze withTimeout(final int timeout) { set(TIMEOUT, String.valueOf(timeout)); setReadTimeout(timeout); return this; } @Override protected final ApiType getApiType() { return ApiType.ARTICLE; } } } /** * An accessor for creating requests from the {@link Images} collection. * *

* The typical use is: *

* *
	 *   {@code Diffbot diffbot = new Diffboat(...);}
	 *   {@code Diffboat.Images.Analyze request = diffboat.images().analyze(parameters ...)}
	 * 
* * @return the resource collection */ public final Images images() { return new Images(); } /** * The "images" collection of methods. */ public final class Images { /** * Get images analyzed by Diffbot. * * Create a request for the method "image". * * This request holds the parameters needed by the diffbot server. After setting any optional parameters, call * the {@link Analyze#execute()} method to invoke the remote operation or {@link Analyze#queue()} to add it to * the batch queue. * * @param url URL to extract the {@link Images} from. * @return the request */ public Analyze analyze(final String url) { return new Analyze(url); } /** * Request to the Diffbot Image API. */ public class Analyze extends DiffbotRequest { private static final String IMAGE_URI = "http://api.diffbot.com/v2/image"; /** * Creates a {@code DiffbotRequest} to extract an article of a given URL. * * @param url URL to extract the {@link Images} from. */ protected Analyze(final String url) { super(Diffbot.this, com.syncthemall.diffbot.model.images.Images.class, IMAGE_URI); set(URL, url); } /** * The fields parameter allows to limit or expand which fields are returned in the response. For nested * arrays, use parentheses to retrieve specific fields, or * to return all sub-fields. *

* For example: meta,links,images(*) * * @param fields information returned by the API * @return the instance of this request * * @see Diffbot - Image API Documentation */ public final Analyze withFields(final String fields) { set(FIELDS, fields); return this; } /** * Overrides the default API timeout of 5000ms. * * @param timeout The timeout in milliseconds * * @return the instance of this request */ public final Analyze withTimeout(final int timeout) { set(TIMEOUT, String.valueOf(timeout)); setReadTimeout(timeout); return this; } @Override protected final ApiType getApiType() { return ApiType.IMAGE; } } } /** * An accessor for creating requests from the {@link Products} collection. * *

* The typical use is: *

* *
	 *   {@code Diffbot diffbot = new Diffboat(...);}
	 *   {@code Diffboat.Products.Analyze request = diffboat.products().analyze(parameters ...)}
	 * 
* * @return the resource collection */ public final Products products() { return new Products(); } /** * The "products" collection of methods. */ public final class Products { /** * Get product webpage analyzed by Diffbot. * * Create a request for the method "product". * * This request holds the parameters needed by the diffbot server. After setting any optional parameters, call * the {@link Analyze#execute()} method to invoke the remote operation or {@link Analyze#queue()} to add it to * the batch queue. * * @param url URL to extract the {@link Products} from. * @return the request */ public Analyze analyze(final String url) { return new Analyze(url); } /** * Request to the Diffbot Product API. */ public class Analyze extends DiffbotRequest { private static final String PRODUCT_URI = "http://api.diffbot.com/v2/product"; /** * Creates a {@code DiffbotRequest} to extract an article of a given URL. * * @param url URL to extract the {@link Products} from. */ protected Analyze(final String url) { super(Diffbot.this, com.syncthemall.diffbot.model.products.Products.class, PRODUCT_URI); set(URL, url); } /** * The fields parameter allows to limit or expand which fields are returned in the response. For nested * arrays, use parentheses to retrieve specific fields, or * to return all sub-fields. *

* For example: meta,links,images(*) * * @param fields information returned by the API * @return the instance of this request * * @see Diffbot - Product API Documentation */ public final Analyze withFields(final String fields) { set(FIELDS, fields); return this; } /** * Overrides the default API timeout of 5000ms. * * @param timeout The timeout in milliseconds * * @return the instance of this request */ public final Analyze withTimeout(final int timeout) { set(TIMEOUT, String.valueOf(timeout)); setReadTimeout(timeout); return this; } @Override protected final ApiType getApiType() { return ApiType.PRODUCT; } } } /** * An accessor for creating requests from the {@link Classifier} collection. * *

* The typical use is: *

* *
	 *   {@code Diffbot diffbot = new Diffboat(...);}
	 *   {@code Diffboat.Classifiers.Analyze request = diffboat.classifier().analyze(parameters ...)}
	 * 
* * @return the resource collection */ public final Classifier classifier() { return new Classifier(); } /** * The "classifier" collection of methods. */ public final class Classifier { /** * Get product webpage analyzed by Diffbot. * * Create a request for the method "product". * * This request holds the parameters needed by the diffbot server. After setting any optional parameters, call * the {@link Analyze#execute()} method to invoke the remote operation or {@link Analyze#queue()} to add it to * the batch queue. * * @param url URL to extract the {@link Classified} from. * @return the request */ public Analyze analyze(final String url) { return new Analyze(url); } /** * Request to the Diffbot Product API. */ public class Analyze extends DiffbotRequest { private static final String CLASSIFIER_URI = "http://api.diffbot.com/v2/analyze"; /** * Creates a {@code DiffbotRequest} to extract an article of a given URL. * * @param url URL to extract the {@link Products} from. */ protected Analyze(final String url) { super(Diffbot.this, com.syncthemall.diffbot.model.classifier.Classified.class, CLASSIFIER_URI); set(URL, url); } /** * The fields parameter allows to limit or expand which fields are returned in the response. For nested * arrays, use parentheses to retrieve specific fields, or * to return all sub-fields. *

* For example: meta,links,images(*) * * @param fields information returned by the API * @return the instance of this request * * @see Diffbot - Product API Documentation */ public final Analyze withFields(final String fields) { set(FIELDS, fields); return this; } /** * Overrides the default API timeout of 5000ms. * * @param timeout The timeout in milliseconds * * @return the instance of this request */ public final Analyze withTimeout(final int timeout) { set(TIMEOUT, String.valueOf(timeout)); setReadTimeout(timeout); return this; } /** * By default the Page Classifier API will fully extract pages that match an existing Diffbot Automatic API. * Set mode to a specific {@code PageType} to extract content only from that particular {@code PageType}. * All others will simply return the page classification information. * * @param type the {@code PageType} for which to extract content * * @return the instance of this request */ public final Analyze withMode(final PageType type) { set(MODE, type.getKey()); return this; } /** * Returns statistics on page classification and extraction, including an array of individual page-types and * the Diffbot-determined score (likelihood) for each type. 
* * @return the instance of this request */ public final Analyze withStats() { set(STATS, ""); return this; } @Override protected final ApiType getApiType() { return ApiType.CLASSIFIER; } } } /** * @return the maximum number of request send in one batch */ public final int getMaxBatchRequest() { return maxBatchRequest; } /** * @param maxBatchRequest maximum number of requests send in one batch (minimum 1, maximum 50, default 25) */ public final void setMaxBatchRequest(final int maxBatchRequest) { if (maxBatchRequest <= 0 || maxBatchRequest > 50) { throw new IllegalArgumentException(bundle.getString("max.batch.request")); } this.maxBatchRequest = maxBatchRequest; } /** * @return the timeout of a batch request */ public final int getBatchRequestTimeout() { return batchRequestTimeout; } /** * @param batchRequestTimeout the timeout of a batch request (the default value is 5 minutes) or {@code 0} for an * infinite timeout */ public final void setBatchRequestTimeout(final int batchRequestTimeout) { this.batchRequestTimeout = batchRequestTimeout; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy