All downloads are free. The search and download features use the official Maven repository.

gobblin.ingestion.google.webmaster.UrlTriePostOrderIterator Maven / Gradle / Ivy

package gobblin.ingestion.google.webmaster;

import com.google.api.client.repackaged.com.google.common.base.Preconditions;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.Iterator;
import java.util.Map;
import java.util.NoSuchElementException;
import org.apache.commons.lang3.tuple.Pair;


/**
 * This is a post-order iterator that traverses the nodes on the URL trie with a stopping rule, which is, it will not go deeper into the nodes whose size(defined as the number of descendant URLs and itself if itself is a URL page) is less than or equal to the stopping size. In other words, those nodes with size less than or equal to the stopping size will be treated as leaf nodes.
 *
 * Iteration value:
 * Pair.1 is the full path to current node.
 * Pair.2 is current node.
 */
public class UrlTriePostOrderIterator implements Iterator> {

  private final int _groupSize;
  private final StringBuilder _currentPrefixSb;
  private Deque _unprocessed = new ArrayDeque<>();
  private UrlTrieNode _currentNode;
  private UrlTrieNode _lastVisited = null;
  private UrlTrieNode _toReturn;

  public UrlTriePostOrderIterator(UrlTrie trie, int stoppingSize) {
    Preconditions.checkArgument(stoppingSize > 0);
    _currentNode = trie.getRoot();
    String prefix = trie.getPrefix();
    _currentPrefixSb = new StringBuilder();
    if (prefix != null) {
      _currentPrefixSb.append(prefix);
    }
    _groupSize = stoppingSize;
  }

  @Override
  public boolean hasNext() {
    if (_toReturn != null) {
      return true;
    }

    while (!_unprocessed.isEmpty() || !isStoppingNode(_currentNode)) {
      if (!isStoppingNode(_currentNode)) {
        //keep going down if not at leaf
        _unprocessed.push(_currentNode);
        _currentPrefixSb.append(_currentNode.getValue());

        Map.Entry next = _currentNode.children.firstEntry();
        if (next == null) {
          _currentNode = null;
        } else {
          _currentNode = next.getValue();
        }
      } else {

        UrlTrieNode peekNode = _unprocessed.peek();
        if (_currentNode != null || peekNode.children.isEmpty()
            || peekNode.children.lastEntry().getValue() == _lastVisited) {
          //_currentNode is a returnable stopping node
          if (_currentNode != null) {
            _toReturn = _currentNode;
          } else {
            _toReturn = _unprocessed.pop();
            _currentPrefixSb.setLength(_currentPrefixSb.length() - 1);
          }

          //If there is no parent, it's the last one; otherwise, move to right
          UrlTrieNode parent = _unprocessed.peek();
          if (parent == null) {
            return true; //we've got the last one.
          }
          //move to the right sibling. Set to null, if there is no right sibling.
          Map.Entry sibling = parent.children.higherEntry(_toReturn.getValue());
          if (sibling == null) {
            _currentNode = null;
          } else {
            _currentNode = sibling.getValue();
          }

          return true;
        } else {
          //hand over to the next loop to move right
          _currentNode = peekNode;
        }
      }
    }

    //This case happens when the whole trie has fewer URLs than the group size
    if (_lastVisited == null && _currentNode.getSize() > 0) {
      //_currentNode is now at the root node, which is a leaf by the iterator's definition
      _toReturn = _currentNode;
      return true;
    }
    return false;
  }

  /**
   * A node is a stopping node, from which you cannot go deeper, if
   *   1. this node is null
   *   2. this node has descendants <= groupSize, but this node is returnable
   */
  private boolean isStoppingNode(UrlTrieNode node) {
    return node == null || node.getSize() <= _groupSize;
  }

  @Override
  public Pair next() {
    if (hasNext()) {
      _lastVisited = _toReturn;
      _toReturn = null;
      return Pair.of(_currentPrefixSb.toString() + _lastVisited.getValue(), _lastVisited);
    }
    throw new NoSuchElementException();
  }

  @Override
  public void remove() {
    throw new UnsupportedOperationException();
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy