All downloads are free. The search and download functionality uses the official Maven repository.

com.credibledoc.log.labelizer.crawler.Crawler Maven / Gradle / Ivy

package com.credibledoc.log.labelizer.crawler;

import com.credibledoc.log.labelizer.config.Config;
import com.credibledoc.log.labelizer.exception.LabelizerRuntimeException;
import com.credibledoc.log.labelizer.github.GithubRepo;
import com.credibledoc.log.labelizer.github.GithubRepoRepository;
import com.credibledoc.log.labelizer.github.VisitedUrl;
import com.credibledoc.log.labelizer.github.VisitedUrlRepository;
import com.credibledoc.log.labelizer.pagepattern.PagePattern;
import com.credibledoc.log.labelizer.pagepattern.PagePatternRepository;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import dev.morphia.query.internal.MorphiaCursor;
import org.jetbrains.annotations.NotNull;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

/**
 * Helps to collect date and time patterns for a network training data.
 * <p>
 * The crawler runs three steps in sequence (see {@link #startJobs()}):
 * <ol>
 *     <li>searches GitHub repositories for the keywords from {@link Config#getGithubSearchKeywords()}
 *     and stores the Java repositories of their owners,</li>
 *     <li>searches code in the not yet visited repositories and stores the URLs of found pages
 *     as {@link PagePattern}s,</li>
 *     <li>submits a {@link RunnableCrawler} to a thread pool for every page pattern returned by
 *     {@link PagePatternRepository#getCursorOfEmptyPatterns()}.</li>
 * </ol>
 *
 * @author Kyrylo Semenko
 */
public class Crawler {
    private static final Logger logger = LoggerFactory.getLogger(Crawler.class);
    private static final String HTTPS_API_GITHUB_COM = "https://api.github.com";
    private static final String SEARCH_REPOSITORIES_Q = "/search/repositories?q=";

    /**
     * NOTE(review): the trailing '?' becomes a part of the 'q' parameter value, because the URL already
     * contains '?q='. It is left unchanged, since the resulting URLs are stored and compared as keys of
     * already visited pages - confirm before changing.
     */
    private static final String LANGUAGE_JAVA = "+language%3Ajava?";
    private static final String PER_PAGE = "&per_page=";
    private static final int NUM_ITEMS_PER_PAGE_100 = 100;

    /**
     * Github provides max 1000 results, that is 10 pages with {@link #NUM_ITEMS_PER_PAGE_100} items.
     */
    private static final int MAX_PAGES_10 = 10;

    /**
     * Pause before each request to the GitHub API, see {@link #createConnection(String)}.
     * <p>
     * NOTE(review): a 2 seconds pause allows about 30 requests per minute, but the previous version of this
     * comment mentioned a limit of 10 requests per minute. Verify the value against the current limits
     * on the https://developer.github.com/v3/search page.
     */
    private static final int DELAY_BETWEEN_REQUESTS_SECONDS_2 = 2;
    private static final int MAX_THREADS_25 = 25;
    private static final String FIELD_FULL_NAME = "full_name";
    private static final String FIELD_LANGUAGE = "language";
    private static final String JAVA = "Java";
    private static final int MAX_QUERY_LENGTH_256 = 256;
    private static final String PLACEHOLDER = "##placeholder##";
    private static final int DIGITS_IN_PAGE_NUM = 2;

    public static void main(String[] args) {
        PagePatternRepository pagePatternRepository = PagePatternRepository.getInstance();
        logger.info("Context path: '{}'", new File("").getAbsolutePath());
        logger.info("Repository created. HashCode: {}", pagePatternRepository.hashCode());
        Crawler crawler = new Crawler();
        crawler.startJobs();
    }

    /**
     * Run all crawler steps: find repositories, search pages in the not visited repositories
     * and submit the pages for processing.
     *
     * @throws LabelizerRuntimeException wraps any exception thrown by one of the steps.
     */
    private void startJobs() {
        try {
            findRepositories();
            GithubRepoRepository githubRepoRepository = GithubRepoRepository.getInstance();
            List<GithubRepo> githubRepos = githubRepoRepository.selectNotVisited();
            StringBuilder stringBuilder = new StringBuilder(MAX_QUERY_LENGTH_256);
            String mmInFile = "https://api.github.com/search/code?q=mm+in%3Afile";
            String searchString = mmInFile +
                PLACEHOLDER +
                PER_PAGE +
                NUM_ITEMS_PER_PAGE_100 +
                "&page=";
            List<GithubRepo> visitedRepos = new ArrayList<>();
            for (GithubRepo githubRepo : githubRepos) {
                searchLinksInRepository(stringBuilder, searchString, visitedRepos, githubRepo);
            }
            // The loop searches a batch only when the next repository does not fit into the query,
            // hence the last accumulated batch has to be searched explicitly.
            searchAccumulatedRepos(stringBuilder, searchString, visitedRepos);
            execute();
        } catch (Exception e) {
            throw new LabelizerRuntimeException(e);
        }
    }

    /**
     * Add the repository to the batch of "+repo:..." qualifiers accumulated in the stringBuilder.
     * When the search URL would exceed {@link #MAX_QUERY_LENGTH_256} characters, the accumulated
     * batch is searched first and a new batch is started with this repository.
     *
     * @param stringBuilder accumulated "+repo:..." qualifiers of the current batch
     * @param searchString  the search URL template with the {@link #PLACEHOLDER}, ends with "&amp;page="
     * @param visitedRepos  repositories of the current batch, they are saved after the batch is searched
     * @param githubRepo    the repository to add, it is marked as visited
     */
    private void searchLinksInRepository(StringBuilder stringBuilder, String searchString,
                                         List<GithubRepo> visitedRepos, GithubRepo githubRepo) {
        String repoName = "+repo:" + githubRepo.getFullName();
        int lengthWithRepo = searchString.length() - PLACEHOLDER.length() + stringBuilder.length()
            + repoName.length() + DIGITS_IN_PAGE_NUM;
        if (lengthWithRepo > MAX_QUERY_LENGTH_256) {
            searchAccumulatedRepos(stringBuilder, searchString, visitedRepos);
        }
        stringBuilder.append(repoName);
        githubRepo.setVisited(true);
        visitedRepos.add(githubRepo);
    }

    /**
     * Search code in the accumulated batch of repositories, store found pages and persist
     * the repositories of the batch. Both the stringBuilder and the visitedRepos are emptied.
     * Does nothing when the batch is empty.
     *
     * @param stringBuilder accumulated "+repo:..." qualifiers of the batch
     * @param searchString  the search URL template with the {@link #PLACEHOLDER}, ends with "&amp;page="
     * @param visitedRepos  repositories of the batch
     */
    private void searchAccumulatedRepos(StringBuilder stringBuilder, String searchString,
                                        List<GithubRepo> visitedRepos) {
        if (stringBuilder.length() == 0) {
            // A query without repositories would not be restricted to the collected repositories.
            return;
        }
        String longSearchString = searchString.replace(PLACEHOLDER, stringBuilder);
        stringBuilder.setLength(0);

        // The first page is always requested, because it provides the total number of results.
        // NOTE(review): URLs of code search pages are not saved to the VisitedUrlRepository in this
        // class, so the contains(..) check below depends on entries saved elsewhere - confirm.
        JsonObject searchResults = savePages(1, longSearchString);

        int numResults = searchResults.getAsJsonPrimitive("total_count").getAsInt();
        // Round up, otherwise the last, partially filled page would be skipped.
        int numPages = numResults / NUM_ITEMS_PER_PAGE_100
            + (numResults % NUM_ITEMS_PER_PAGE_100 == 0 ? 0 : 1);
        int lastPage = Math.min(numPages, MAX_PAGES_10);
        VisitedUrlRepository visitedUrlRepository = VisitedUrlRepository.getInstance();
        for (int nextPage = 2; nextPage <= lastPage; nextPage++) {
            if (!visitedUrlRepository.contains(longSearchString + nextPage)) {
                savePages(nextPage, longSearchString);
            }
        }
        GithubRepoRepository.getInstance().save(visitedRepos);
        visitedRepos.clear();
    }

    /**
     * Search repositories for each keyword from {@link Config#getGithubSearchKeywords()} and
     * process items of each not yet visited result page with {@link #getAndSaveJavaRepos(JsonElement)}.
     * Processed page URLs are stored in the {@link VisitedUrlRepository}.
     *
     * @throws IOException in case of a failed request
     */
    private void findRepositories() throws IOException {
        VisitedUrlRepository visitedUrlRepository = VisitedUrlRepository.getInstance();
        for (String keyword : Config.getGithubSearchKeywords()) {
            // Github provides max 1000 results
            for (int page = 1; page <= MAX_PAGES_10; page++) {
                String nextPageUrl = HTTPS_API_GITHUB_COM + SEARCH_REPOSITORIES_Q + keyword + LANGUAGE_JAVA +
                    PER_PAGE + NUM_ITEMS_PER_PAGE_100 + "&page=" + page;
                if (!visitedUrlRepository.contains(nextPageUrl)) {
                    JsonObject repoList = getNextRepoList(nextPageUrl);
                    JsonArray items = repoList.getAsJsonArray("items");
                    for (JsonElement nextElement : items) {
                        getAndSaveJavaRepos(nextElement);
                    }
                    VisitedUrl visitedUrl = new VisitedUrl(nextPageUrl);
                    visitedUrlRepository.save(Collections.singletonList(visitedUrl));
                }
            }
        }
    }

    /**
     * Request a single page of code search results and store the "html_url" of each item which
     * is not yet contained in the {@link PagePatternRepository}.
     *
     * @param page             the page number, it is appended to the longSearchString
     * @param longSearchString the search URL which ends with "&amp;page="
     * @return the parsed response
     */
    @NotNull
    private JsonObject savePages(int page, String longSearchString) {
        JsonObject searchResults = getNextSearchResult(longSearchString + page);
        JsonArray items = searchResults.getAsJsonArray("items");
        PagePatternRepository pagePatternRepository = PagePatternRepository.getInstance();
        List<PagePattern> pagePatterns = new ArrayList<>();
        for (JsonElement element : items) {
            String pageUrl = ((JsonObject) element).get("html_url").getAsString();
            if (!pagePatternRepository.containsPage(pageUrl)) {
                PagePattern pagePattern = new PagePattern();
                pagePattern.setPageUrl(pageUrl);
                pagePatterns.add(pagePattern);
            }
        }
        pagePatternRepository.save(pagePatterns);
        return searchResults;
    }

    /**
     * Request the URL and parse the response body as a JSON object.
     *
     * @param query the URL to request
     * @return the parsed response
     * @throws LabelizerRuntimeException when the request or the parsing fails
     */
    private JsonObject getNextSearchResult(String query) {
        try {
            String json = createConnection(query)
                .execute()
                .body();
            return new JsonParser().parse(json).getAsJsonObject();
        } catch (Exception e) {
            throw new LabelizerRuntimeException(e);
        }
    }

    /**
     * When the search result item is a Java repository which is not yet contained in the
     * {@link GithubRepoRepository}, load and save the repositories listed at its owner's "repos_url".
     *
     * @param nextElement a single item of the repository search result
     * @throws IOException in case of a failed request
     */
    private void getAndSaveJavaRepos(JsonElement nextElement) throws IOException {
        JsonObject nextJsonObject = nextElement.getAsJsonObject();
        if (!isJavaRepo(nextJsonObject)) {
            return;
        }
        JsonObject owner = nextJsonObject.get("owner").getAsJsonObject();
        String repoFullName = nextJsonObject.getAsJsonPrimitive(FIELD_FULL_NAME).getAsString();
        String reposUrl = owner.get("repos_url").getAsString();
        if (!GithubRepoRepository.getInstance().contains(repoFullName)) {
            getAndSaveRepos(reposUrl);
        }
    }

    /**
     * Request the list of repositories and save those of them with the {@link #JAVA} language.
     *
     * @param reposUrl the URL which returns a JSON array of repositories
     * @throws IOException in case of a failed request
     */
    private void getAndSaveRepos(String reposUrl) throws IOException {
        String json = createConnection(reposUrl)
            .execute()
            .body();
        JsonArray repos = new JsonParser().parse(json).getAsJsonArray();
        List<GithubRepo> githubRepos = new ArrayList<>();
        for (JsonElement repo : repos) {
            JsonObject repoJsonObject = repo.getAsJsonObject();
            if (isJavaRepo(repoJsonObject)) {
                String fullName = repoJsonObject.get(FIELD_FULL_NAME).getAsString();
                githubRepos.add(new GithubRepo(fullName));
            }
        }
        GithubRepoRepository.getInstance().save(githubRepos);
    }

    /**
     * @param repoJsonObject a JSON description of a repository
     * @return 'true' if the {@link #FIELD_LANGUAGE} member exists, is not a JSON null and
     * equals to {@link #JAVA}.
     */
    private boolean isJavaRepo(JsonObject repoJsonObject) {
        // JsonObject#get returns null when the member is missing.
        JsonElement languageElement = repoJsonObject.get(FIELD_LANGUAGE);
        return languageElement != null
            && !languageElement.isJsonNull()
            && JAVA.equals(languageElement.getAsString());
    }

    /**
     * Submit a {@link RunnableCrawler} for each {@link PagePattern} provided by
     * {@link PagePatternRepository#getCursorOfEmptyPatterns()} to a pool of {@link #MAX_THREADS_25} threads.
     * The pool is shut down after all tasks have been submitted, the method does not wait
     * for their completion.
     *
     * @throws LabelizerRuntimeException when reading of the cursor or a task submission fails
     */
    private void execute() {
        ExecutorService executorService = Executors.newFixedThreadPool(MAX_THREADS_25);
        try (MorphiaCursor<PagePattern> cursor = PagePatternRepository.getInstance().getCursorOfEmptyPatterns()) {
            while (cursor.hasNext()) {
                PagePattern pagePattern = cursor.next();
                createRunnable(pagePattern, executorService);
            }
        } catch (Exception e) {
            throw new LabelizerRuntimeException(e);
        } finally {
            // Shut down also in case of a failure, otherwise idle pool threads keep the JVM running.
            executorService.shutdown();
        }
    }

    /**
     * Create a {@link RunnableCrawler} for the page pattern and submit it to the executor service.
     */
    private void createRunnable(PagePattern pagePattern, ExecutorService executorService) {
        RunnableCrawler runnableCrawler = new RunnableCrawler(pagePattern);
        executorService.submit(runnableCrawler);
    }

    /**
     * Request the URL and parse the response body as a JSON object. In case of a failure
     * a {@link PagePattern} with the URL and the error description is saved before
     * the exception is rethrown.
     *
     * @param searchPageUrl the URL of a repository search page
     * @return the parsed response
     * @throws LabelizerRuntimeException when the request or the parsing fails
     */
    private JsonObject getNextRepoList(String searchPageUrl) {
        try {
            Connection.Response response = createConnection(searchPageUrl).execute();
            String json = response.body();
            return new JsonParser().parse(json).getAsJsonObject();
        } catch (Exception e) {
            PagePattern pagePattern = new PagePattern();
            pagePattern.setPageUrl(searchPageUrl);
            // The stack trace is printed to a String, because it is stored together with the page pattern.
            StringWriter sw = new StringWriter();
            PrintWriter pw = new PrintWriter(sw);
            e.printStackTrace(pw);
            pagePattern.setErrorMessage("Error in the getNextRepoList method. Message: " + e.getMessage() + ". StackTrace: " + sw.toString());
            PagePatternRepository.getInstance().save(Collections.singletonList(pagePattern));
            throw new LabelizerRuntimeException(e);
        }
    }

    /**
     * Wait for {@link #DELAY_BETWEEN_REQUESTS_SECONDS_2} seconds plus 50 milliseconds and create
     * a connection with the "Accept" and "Authorization" headers. The request is not executed here.
     *
     * @param searchPageUrl the URL to connect to
     * @return the prepared connection
     * @throws LabelizerRuntimeException when the thread is interrupted during the pause or
     * the connection cannot be created
     */
    private Connection createConnection(String searchPageUrl) {
        try {
            Thread.sleep(DELAY_BETWEEN_REQUESTS_SECONDS_2 * 1000L + 50);
        } catch (InterruptedException e) {
            // Restore the interrupt flag, so the callers can see that the thread was interrupted.
            Thread.currentThread().interrupt();
            throw new LabelizerRuntimeException(e);
        }
        try {
            String token = "token " + Config.getGithubOauthToken();
            return Jsoup.connect(searchPageUrl)
                .ignoreContentType(true)
                .header("Accept", "application/vnd.github.v3+json")
                .header("Authorization", token);
        } catch (Exception e) {
            throw new LabelizerRuntimeException(e);
        }
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy