All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.google.gerrit.server.mail.receive.HtmlParser Maven / Gradle / Ivy

There is a newer version: 3.11.0
Show newest version
// Copyright (C) 2016 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package com.google.gerrit.server.mail.receive;

import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterators;
import com.google.common.collect.PeekingIterator;
import com.google.gerrit.reviewdb.client.Comment;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

/** Provides functionality for parsing the HTML part of a {@link MailMessage}. */
public class HtmlParser {

  private static final ImmutableList MAIL_PROVIDER_EXTRAS =
      ImmutableList.of(
          "gmail_extra", // "On 01/01/2017 User wrote:"
          "gmail_quote" // Used for quoting original content
          );

  private HtmlParser() {}

  /**
   * Parses comments from html email.
   *
   * 

This parser goes though all html elements in the email and checks for matching patterns. It * keeps track of the last file and comments it encountered to know in which context a parsed * comment belongs. It uses the href attributes of tags to identify comments sent out by * Gerrit as these are generally more reliable then the text captions. * * @param email the message as received from the email service * @param comments a specific set of comments as sent out in the original notification email. * Comments are expected to be in the same order as they were sent out to in the email. * @param changeUrl canonical change URL that points to the change on this Gerrit instance. * Example: https://go-review.googlesource.com/#/c/91570 * @return list of MailComments parsed from the html part of the email */ public static List parse( MailMessage email, Collection comments, String changeUrl) { // TODO(hiesel) Add support for Gmail Mobile // TODO(hiesel) Add tests for other popular email clients // This parser goes though all html elements in the email and checks for // matching patterns. It keeps track of the last file and comments it // encountered to know in which context a parsed comment belongs. // It uses the href attributes of tags to identify comments sent out by // Gerrit as these are generally more reliable then the text captions. List parsedComments = new ArrayList<>(); Document d = Jsoup.parse(email.htmlContent()); PeekingIterator iter = Iterators.peekingIterator(comments.iterator()); String lastEncounteredFileName = null; Comment lastEncounteredComment = null; for (Element e : d.body().getAllElements()) { String elementName = e.tagName(); boolean isInBlockQuote = e.parents().stream().filter(p -> p.tagName().equals("blockquote")).findAny().isPresent(); if (elementName.equals("a")) { String href = e.attr("href"); // Check if there is still a next comment that could be contained in // this tag if (!iter.hasNext()) { continue; } Comment perspectiveComment = iter.peek(); if (href.equals(ParserUtil.filePath(changeUrl, perspectiveComment))) { if (lastEncounteredFileName == null || !lastEncounteredFileName.equals(perspectiveComment.key.filename)) { // Not a file-level comment, but users could have typed a comment // right after this file annotation to create a new file-level // comment. If this file has a file-level comment, we have already // set lastEncounteredComment to that file-level comment when we // encountered the file link and should not reset it now. lastEncounteredFileName = perspectiveComment.key.filename; lastEncounteredComment = null; } else if (perspectiveComment.lineNbr == 0) { // This was originally a file-level comment lastEncounteredComment = perspectiveComment; iter.next(); } } else if (ParserUtil.isCommentUrl(href, changeUrl, perspectiveComment)) { // This is a regular inline comment lastEncounteredComment = perspectiveComment; iter.next(); } } else if (!isInBlockQuote && elementName.equals("div") && !MAIL_PROVIDER_EXTRAS.contains(e.className())) { // This is a comment typed by the user // Replace non-breaking spaces and trim string String content = e.ownText().replace('\u00a0', ' ').trim(); if (!Strings.isNullOrEmpty(content)) { if (lastEncounteredComment == null && lastEncounteredFileName == null) { // Remove quotation line, email signature and // "Sent from my xyz device" content = ParserUtil.trimQuotation(content); // TODO(hiesel) Add more sanitizer if (!Strings.isNullOrEmpty(content)) { ParserUtil.appendOrAddNewComment( new MailComment(content, null, null, MailComment.CommentType.CHANGE_MESSAGE), parsedComments); } } else if (lastEncounteredComment == null) { ParserUtil.appendOrAddNewComment( new MailComment( content, lastEncounteredFileName, null, MailComment.CommentType.FILE_COMMENT), parsedComments); } else { ParserUtil.appendOrAddNewComment( new MailComment( content, null, lastEncounteredComment, MailComment.CommentType.INLINE_COMMENT), parsedComments); } } } } return parsedComments; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy