com.sangupta.blogparser.blogger.BloggerParser Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of blog-parser Show documentation
Java library to parse blog exports into strongly-typed data object model
The newest version!
/**
 *
 * BlogParser - Parsing library for Blog exports 
 * Copyright (c) 2012, Sandeep Gupta
 * 
 * http://www.sangupta/projects/blog-parser
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * 		http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 */

package com.sangupta.blogparser.blogger;

import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.jdom.Element;

import com.sangupta.blogparser.Parser;
import com.sangupta.blogparser.domain.Author;
import com.sangupta.blogparser.domain.Blog;
import com.sangupta.blogparser.domain.BlogPage;
import com.sangupta.blogparser.domain.BlogPost;
import com.sangupta.blogparser.domain.PostComment;
import com.sun.syndication.feed.synd.SyndCategoryImpl;
import com.sun.syndication.feed.synd.SyndContentImpl;
import com.sun.syndication.feed.synd.SyndEntryImpl;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.feed.synd.SyndPersonImpl;
import com.sun.syndication.io.FeedException;
import com.sun.syndication.io.SyndFeedInput;

/**
 * Implementation to parse a Blogger XML export of a blog.
 * 
 * @author sangupta
 * @since 1.0
 */
public class BloggerParser implements Parser {

	/**
	 * Parse the XML feed and return the {@link Blog} object.
	 */
	public Blog parse(String blogData) {
		return parse(new StringReader(blogData));
	}
	
	/**
	 * Parse the feed from the given reader and return the {@link Blog} object.
	 * 
	 * @param reader the reader to use for reading contents of the blog export
	 * 
	 * @throws IllegalArgumentException if the reader supplied is null
	 */
	public Blog parse(Reader reader) {
		if(reader == null) {
			throw new IllegalArgumentException("Reader cannot be null.");
		}
		
		SyndFeed feed = null;
		try {
			feed = new SyndFeedInput().build(reader);
		} catch (IllegalArgumentException e) {
			throw new RuntimeException("Illegal arguments when parsing feed", e);
		} catch (FeedException e) {
			throw new RuntimeException("Unable to parse feed", e);
		}
		
		Blog blog = new Blog();
		blog.setUrl(feed.getLink());
		blog.setTitle(feed.getTitleEx().getValue());
		
		// author info
		// email, name, profile url
		SyndPersonImpl blogAuthor = ((SyndPersonImpl) feed.getAuthors().get(0));
		Author author = new Author();
		author.setName(blogAuthor.getName());
		author.setEmail(blogAuthor.getEmail());
		author.setProfileUrl(blogAuthor.getUri());
		
		blog.addAuthor(author);
		
		// create a map to store posts per URL
		// this will be used to store comments
		Map posts = new HashMap();
		
		// get each individual entry
		for(Object obj : feed.getEntries()) {
			SyndEntryImpl entry = (SyndEntryImpl) obj;
			
			SyndCategoryImpl category = (SyndCategoryImpl) entry.getCategories().get(0);

			// blog description
			if(entry.getUri().endsWith(".settings.BLOG_DESCRIPTION") && category.getTaxonomyUri().equals("http://schemas.google.com/g/2005#kind") && category.getName().equals("http://schemas.google.com/blogger/2008/kind#settings")) {
				blog.setDescription(((SyndContentImpl) entry.getContents().get(0)).getValue());
			}
			
			// post
			if(category.getTaxonomyUri().equals("http://schemas.google.com/g/2005#kind") && category.getName().equals("http://schemas.google.com/blogger/2008/kind#post")) {
				// do as a proper post
				BlogPost post = convertToBlogPost(entry);
				blog.addPost(post);
				posts.put(post.getUrl(), post);
				continue;
			}
			
			// comment
			if(category.getTaxonomyUri().equals("http://schemas.google.com/g/2005#kind") && category.getName().equals("http://schemas.google.com/blogger/2008/kind#comment")) {
				PostComment comment = convertToPostComment(entry);
				BlogPost post = posts.get(comment.getPostUrl());
				if(post != null) {
					post.addComment(comment);
				}
				continue;
			}
			
			// page
			if(category.getTaxonomyUri().equals("http://schemas.google.com/g/2005#kind") && category.getName().equals("http://schemas.google.com/blogger/2008/kind#page")) {
				// do as a proper post
				BlogPage page = convertToBlogPage(entry);
				blog.addPage(page);
				continue;
			}
		}
		
		return blog;
	}
	
	/**
	 * Convert the syndicated feed entry to the {@link BlogPage} object.
	 * 
	 * @param entry
	 * @return
	 */
	private BlogPage convertToBlogPage(SyndEntryImpl entry) {
		BlogPage page = new BlogPage();
		
		page.setContent(((SyndContentImpl) entry.getContents().get(0)).getValue());
		page.setPublishedOn(entry.getPublishedDate());
		page.setUrl(entry.getLink());
		page.setTitle(entry.getTitleEx().getValue());
		
		// extract the author
		SyndPersonImpl blogAuthor = ((SyndPersonImpl) entry.getAuthors().get(0));
		Author author = new Author();
		author.setName(blogAuthor.getName());
		author.setEmail(blogAuthor.getEmail());
		author.setProfileUrl(blogAuthor.getUri());
		
		page.setAuthor(author);
		
		return page;

	}

	/**
	 * Convert the syndicated feed entry to the {@link BlogPost} object.
	 * 
	 * @param entry
	 * @return
	 */
	private BlogPost convertToBlogPost(SyndEntryImpl entry) {
		BlogPost post = new BlogPost();
		
		post.setContent(((SyndContentImpl) entry.getContents().get(0)).getValue());
		post.setPublishedOn(entry.getPublishedDate());
		post.setUrl(entry.getLink());
		post.setTitle(entry.getTitleEx().getValue());
		
		// extract the author
		SyndPersonImpl blogAuthor = ((SyndPersonImpl) entry.getAuthors().get(0));
		Author author = new Author();
		author.setName(blogAuthor.getName());
		author.setEmail(blogAuthor.getEmail());
		author.setProfileUrl(blogAuthor.getUri());
		
		post.setAuthor(author);
		
		// extract the categories
		for(Object obj : entry.getCategories()) {
			SyndCategoryImpl category = (SyndCategoryImpl) obj;
			if(category.getTaxonomyUri().equals("http://www.blogger.com/atom/ns#")) {
				post.addTag(category.getName());
			}
		}
		
		return post;
	}

	/**
	 * Convert the syndicated feed entry to the {@link PostComment} object.
	 * 
	 * @param entry
	 * @return
	 */
	private PostComment convertToPostComment(SyndEntryImpl entry) {
		PostComment comment = new PostComment();
		
		comment.setText(((SyndContentImpl) entry.getContents().get(0)).getValue());
		comment.setPublishedOn(entry.getPublishedDate());
		
		// extract the author
		SyndPersonImpl blogAuthor = ((SyndPersonImpl) entry.getAuthors().get(0));
		Author author = new Author();
		author.setName(blogAuthor.getName());
		author.setEmail(blogAuthor.getEmail());
		author.setProfileUrl(blogAuthor.getUri());
		
		comment.setAuthor(author);
		
		// find the url of the post
		@SuppressWarnings("unchecked")
		List elements = (List) entry.getForeignMarkup();
		for(Element element : elements) {
			if(element.getName().equals("in-reply-to")) {
				String url = element.getAttributeValue("href");
				comment.setPostUrl(url);
				break;
			}
		}
		
		return comment;
	}

}