// org.codelibs.fess.crawler.client.http.PlaywrightClient Maven / Gradle / Ivy
/*
* Copyright 2012-2024 CodeLibs Project and the Others.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
package org.codelibs.fess.crawler.client.http;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Consumer;
import javax.annotation.Resource;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.codelibs.core.exception.IORuntimeException;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.core.misc.Tuple4;
import org.codelibs.core.stream.StreamUtil;
import org.codelibs.fess.crawler.Constants;
import org.codelibs.fess.crawler.CrawlerContext;
import org.codelibs.fess.crawler.client.AbstractCrawlerClient;
import org.codelibs.fess.crawler.container.CrawlerContainer;
import org.codelibs.fess.crawler.entity.RequestData;
import org.codelibs.fess.crawler.entity.RequestData.Method;
import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.crawler.exception.ChildUrlsException;
import org.codelibs.fess.crawler.exception.CrawlerSystemException;
import org.codelibs.fess.crawler.exception.CrawlingAccessException;
import org.codelibs.fess.crawler.filter.UrlFilter;
import org.codelibs.fess.crawler.helper.MimeTypeHelper;
import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.microsoft.playwright.Browser;
import com.microsoft.playwright.Browser.NewContextOptions;
import com.microsoft.playwright.BrowserContext;
import com.microsoft.playwright.BrowserType;
import com.microsoft.playwright.BrowserType.LaunchOptions;
import com.microsoft.playwright.Download;
import com.microsoft.playwright.Page;
import com.microsoft.playwright.Playwright;
import com.microsoft.playwright.Response;
import com.microsoft.playwright.options.Cookie;
import com.microsoft.playwright.options.LoadState;
import com.microsoft.playwright.options.Proxy;
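/**
 * Crawler client that loads URLs in a Playwright-driven browser page and converts the
 * resulting response (rendered HTML, raw body, or downloaded file) into a {@link ResponseData}.
 *
 * @author shinsuke
 */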
public class PlaywrightClient extends AbstractCrawlerClient {
/** Logger for this client. */
private static final Logger logger = LoggerFactory.getLogger(PlaywrightClient.class);

/** Monitor held by {@link #init()} while a worker is created or adopted. */
private static final Object INITIALIZATION_LOCK = new Object();

/** Worker (Playwright, Browser, BrowserContext, Page) handed to every client initialized with {@code sharedClient=true}. */
protected static Tuple4<Playwright, Browser, BrowserContext, Page> SHARED_WORKER = null;

/** Init-parameter name: whether this client uses {@link #SHARED_WORKER} instead of its own worker. */
protected static final String SHARED_CLIENT = "sharedClient";

/** Init-parameter name: name of the {@link LoadState} constant to wait for after navigation. */
protected static final String RENDERED_STATE = "renderedState";

/** Init-parameter name: whether the browser context ignores HTTPS errors. */
protected static final String IGNORE_HTTPS_ERRORS_PROPERTY = "ignoreHttpsErrors";

/** Init-parameter name: proxy bypass value passed to the Playwright proxy settings. */
protected static final String PROXY_BYPASS_PROPERTY = "proxyBypass";

/** {@link SimpleDateFormat} pattern used to parse the {@code last-modified} response header. */
protected static final String LAST_MODIFIED_FORMAT = "EEE, dd MMM yyyy HH:mm:ss z";

/** Environment entries passed to {@code Playwright.create(...)}; filled through {@link #addOption(String, String)}. */
protected Map<String, String> options = new HashMap<>();

/** Browser engine to launch: {@code chromium}, {@code firefox} or {@code webkit}. */
protected String browserName = "chromium";

/** Options passed to the browser launch; may be {@code null}. */
protected LaunchOptions launchOptions;

/** Base options for the browser context; a fresh instance is used when {@code null}. */
protected NewContextOptions newContextOptions;

/** Maximum time, in seconds, to poll for a download after navigation failed. */
protected int downloadTimeout = 15; // 15s

/** Maximum time, in seconds, to wait for each close step. */
protected int closeTimeout = 15; // 15s

/** Load state awaited after navigation before the page is read. */
protected LoadState renderedState = LoadState.NETWORKIDLE;

/** Worker used by this instance; {@code null} until {@link #init()} ran or after {@link #close()}. */
protected Tuple4<Playwright, Browser, BrowserContext, Page> worker;

/** Container used to look up the {@code mimeTypeHelper} component. */
@Resource
protected CrawlerContainer crawlerContainer;
/**
 * Initializes this client by creating its own Playwright worker or adopting the shared one.
 *
 * The whole method runs under {@code INITIALIZATION_LOCK}; it returns immediately when a
 * worker is already assigned. The {@code renderedState} init parameter, when present, must be
 * the exact name of a {@link LoadState} constant, otherwise {@code LoadState.valueOf} throws
 * {@link IllegalArgumentException}.
 *
 * @throws CrawlerSystemException if the worker cannot be created
 */
@Override
public void init() {
    synchronized (INITIALIZATION_LOCK) {
        if (worker != null) {
            return;
        }
        if (logger.isDebugEnabled()) {
            logger.debug("Initializing Playwright...");
        }
        super.init();
        final String renderedStateParam = getInitParameter(RENDERED_STATE, renderedState.name(), String.class);
        if (renderedStateParam != null) {
            renderedState = LoadState.valueOf(renderedStateParam);
        }
        final Boolean shared = getInitParameter(SHARED_CLIENT, Boolean.FALSE, Boolean.class);
        if (shared) {
            // The shared worker is created lazily by the first client that asks for it.
            if (SHARED_WORKER == null) {
                if (logger.isDebugEnabled()) {
                    logger.debug("Creating a shared Playwright worker...");
                }
                SHARED_WORKER = createPlaywrightWorker();
            }
            logger.info("Use a shared Playwright worker.");
            worker = SHARED_WORKER;
        } else {
            worker = createPlaywrightWorker();
        }
    }
}
/**
 * Creates a Playwright instance, launches the configured browser, opens a browser context
 * (authenticated when authentications are configured) and a page.
 *
 * If any step fails, everything created so far is closed before the failure is reported.
 *
 * @return the created (Playwright, Browser, BrowserContext, Page) tuple
 * @throws CrawlerSystemException if any of the Playwright objects cannot be created
 */
protected Tuple4<Playwright, Browser, BrowserContext, Page> createPlaywrightWorker() {
    // initialize Playwright's browser context
    final NewContextOptions contextOptions = initNewContextOptions();
    Playwright playwright = null;
    Browser browser = null;
    BrowserContext browserContext = null;
    Page page = null;
    try {
        playwright = Playwright.create(new Playwright.CreateOptions().setEnv(options));
        browser = getBrowserType(playwright).launch(launchOptions);
        browserContext = createAuthenticatedContext(browser, contextOptions);
        page = browserContext.newPage();
    } catch (final Exception e) {
        if (logger.isDebugEnabled()) {
            logger.debug("Failed to create Playwright instance.", e);
        }
        // Objects that were never created are still null here and are skipped by close(...).
        close(playwright, browser, browserContext, page);
        throw new CrawlerSystemException("Failed to create PlaywrightClient.", e);
    }
    return new Tuple4<>(playwright, browser, browserContext, page);
}
/**
 * Closes the page, browser context, browser and Playwright instance of this client's worker
 * and forgets the worker.
 *
 * When the closed worker is the shared one, the static reference is cleared as well so that a
 * later {@link #init()} creates a fresh shared worker instead of adopting closed objects.
 * Other clients that still hold the closed shared worker are not notified.
 */
@Override
public void close() {
    if (worker == null) {
        return;
    }
    try {
        close(worker.getValue1(), worker.getValue2(), worker.getValue3(), worker.getValue4());
    } finally {
        // Same monitor as init(), which is the only other writer of SHARED_WORKER.
        synchronized (INITIALIZATION_LOCK) {
            if (SHARED_WORKER == worker) {
                SHARED_WORKER = null;
            }
            worker = null;
        }
    }
}
/**
 * Runs {@code closer} on a daemon thread named {@code Playwright-Closer} and waits up to
 * {@code closeTimeout} seconds for it to finish.
 *
 * Failures of the closer are logged and never propagated. If the wait times out, the closer
 * thread keeps running in the background. If the calling thread is interrupted while waiting,
 * the interrupt flag is restored so callers can observe it.
 *
 * @param closer the close action to run
 */
protected void closeInBackground(final Runnable closer) {
    final CountDownLatch latch = new CountDownLatch(1);
    try {
        final Thread thread = new Thread(() -> {
            try {
                closer.run();
            } catch (final Exception e) {
                logger.warn("Failed to close the playwright instance.", e);
            } finally {
                // Always release the waiter, even when the closer dies with an Error;
                // otherwise the caller would block for the full timeout.
                latch.countDown();
            }
        }, "Playwright-Closer");
        thread.setDaemon(true);
        thread.start();
        if (!latch.await(closeTimeout, TimeUnit.SECONDS)) {
            logger.warn("The close process is timed out.");
        }
    } catch (final InterruptedException e) {
        logger.warn("Interrupted to wait a process.", e);
        // Do not swallow the interruption.
        Thread.currentThread().interrupt();
    } catch (final Exception e) {
        logger.warn("Failed to close the playwright instance.", e);
    }
}
/**
 * Closes the given Playwright objects in the order page, browser context, browser, Playwright.
 *
 * Each step goes through {@link #closeInBackground(Runnable)}; {@code null} arguments are skipped.
 *
 * @param playwright the Playwright instance, or {@code null}
 * @param browser the browser, or {@code null}
 * @param context the browser context, or {@code null}
 * @param page the page, or {@code null}
 */
protected void close(final Playwright playwright, final Browser browser, final BrowserContext context, final Page page) {
    closeInBackground(() -> {
        if (page == null) {
            return;
        }
        if (logger.isDebugEnabled()) {
            logger.debug("Closing Page...");
        }
        page.close();
    });
    closeInBackground(() -> {
        if (context == null) {
            return;
        }
        if (logger.isDebugEnabled()) {
            logger.debug("Closing BrowserContext...");
        }
        context.close();
    });
    closeInBackground(() -> {
        if (browser == null) {
            return;
        }
        if (logger.isDebugEnabled()) {
            logger.debug("Closing Browser...");
        }
        browser.close();
    });
    closeInBackground(() -> {
        if (playwright == null) {
            return;
        }
        if (logger.isDebugEnabled()) {
            logger.debug("Closing Playwright...");
        }
        playwright.close();
    });
}
/**
 * Resolves the browser engine selected by {@code browserName}.
 *
 * @param playwright the Playwright instance providing the browser types
 * @return the firefox, webkit or chromium browser type
 * @throws CrawlerSystemException if {@code browserName} is none of the supported names
 */
protected BrowserType getBrowserType(final Playwright playwright) {
    if (logger.isDebugEnabled()) {
        logger.debug("Create {}...", browserName);
    }
    return switch (browserName) {
        case "firefox" -> playwright.firefox();
        case "webkit" -> playwright.webkit();
        case "chromium" -> playwright.chromium();
        default -> throw new CrawlerSystemException("Unknown browser name: " + browserName);
    };
}
/**
 * Registers an entry of the environment map that is handed to Playwright when a worker is created.
 *
 * @param key the entry name
 * @param value the entry value
 */
public void addOption(final String key, final String value) {
    this.options.put(key, value);
}
/**
 * Navigates the worker's page to the request URL and converts the result into a {@link ResponseData}.
 *
 * The page is locked for the whole request. If navigation (or building the response data)
 * fails, the method polls for up to {@code downloadTimeout} seconds for a download event,
 * because the failure may have been caused by the URL being served as a file download. The
 * listeners registered for this request are removed again and the page is reset to
 * {@code about:blank} before returning, so the reused page does not accumulate handlers.
 *
 * @param request the request to execute
 * @return the response data for the page or the downloaded file
 * @throws CrawlingAccessException if neither a page nor a download could be obtained
 */
@Override
public ResponseData execute(final RequestData request) {
    if (worker == null) {
        init();
    }
    final String url = request.getUrl();
    final Page page = worker.getValue4();
    final AtomicReference<Response> responseRef = new AtomicReference<>();
    final AtomicReference<Download> downloadRef = new AtomicReference<>();
    // Keep references to the handlers so the very same instances can be unregistered later.
    // Only the first response seen is recorded.
    final Consumer<Response> responseHandler = response -> responseRef.compareAndSet(null, response);
    final Consumer<Download> downloadHandler = downloadRef::set;
    synchronized (page) {
        try {
            page.onResponse(responseHandler);
            page.onDownload(downloadHandler);
            if (logger.isDebugEnabled()) {
                logger.debug("Accessing {}", url);
            }
            final Response response = page.navigate(url);
            page.waitForLoadState(renderedState);
            if (logger.isDebugEnabled()) {
                logger.debug("Loaded: Base URL: {}, Response URL: {}", url, response.url());
            }
            return createResponseData(page, request, response, null);
        } catch (final Exception e) {
            if (logger.isDebugEnabled()) {
                logger.debug("Waiting for downloaded file: {}", e.getMessage());
            }
            // Poll in 100 ms steps until both a response and a download were observed.
            for (int i = 0; i < downloadTimeout * 10 && (downloadRef.get() == null || responseRef.get() == null); i++) {
                try {
                    page.waitForTimeout(100L);
                } catch (final Exception e1) {
                    if (logger.isDebugEnabled()) {
                        logger.debug("Failed to wait for page loading.", e1);
                    }
                }
            }
            final Response response = responseRef.get();
            final Download download = downloadRef.get();
            if (response != null && download != null) {
                if (logger.isDebugEnabled()) {
                    logger.debug("Downloaded: URL: {}", response.url());
                }
                return createResponseData(page, request, response, download);
            }
            throw new CrawlingAccessException("Failed to access " + request.getUrl(), e);
        } finally {
            try {
                page.offResponse(responseHandler);
                page.offDownload(downloadHandler);
            } catch (final Exception e) {
                logger.warn("Could not remove page listeners.", e);
            } finally {
                resetPage(page);
            }
        }
    }
}
/**
 * Navigates the page to {@code about:blank} and waits for its load event.
 *
 * Failures are logged as warnings and not propagated.
 *
 * @param page the page to reset
 */
protected void resetPage(final Page page) {
    try {
        page.navigate("about:blank");
        page.waitForLoadState(LoadState.LOAD);
    } catch (final Exception resetFailure) {
        logger.warn("Could not reset a page.", resetFailure);
    }
}
/**
 * Builds the {@link ResponseData} for a navigated page or a downloaded file.
 *
 * If the response URL differs from the requested URL and the crawler context's URL filter
 * rejects it, a {@link ChildUrlsException} is thrown. URL, method, charset, status code,
 * last-modified date and all response headers are copied to the result. The body is then
 * filled in one of three ways:
 * - status code above 400: empty body, MIME type taken from the content-type header;
 * - no download: the response body, replaced by the page content when the detected
 *   MIME type is {@code text/html}; the body is not set for HEAD requests;
 * - download: the file is saved to a temporary file which becomes the response body.
 *
 * @param page the page the response was loaded in
 * @param request the executed request
 * @param response the response to convert
 * @param download the download triggered by the request, or {@code null}
 * @return the populated response data
 * @throws ChildUrlsException if the request ended on a URL that is not a crawl target
 * @throws IORuntimeException if the temporary file for a download cannot be created
 */
protected ResponseData createResponseData(final Page page, final RequestData request, final Response response,
        final Download download) {
    final ResponseData responseData = new ResponseData();
    final String originalUrl = request.getUrl();
    final String url = response.url();
    if (!originalUrl.equals(url)) {
        final CrawlerContext context = CrawlingParameterUtil.getCrawlerContext();
        if (context != null) {
            final UrlFilter urlFilter = context.getUrlFilter();
            if (urlFilter != null && !urlFilter.match(url)) {
                if (logger.isDebugEnabled()) {
                    logger.debug("{} is not a target url:", url);
                }
                throw new ChildUrlsException(Collections.emptySet(), "#crawledUrlNotTarget");
            }
        }
        logger.info("Crawled URL: {} -> {}", originalUrl, url);
    }
    responseData.setUrl(url);
    responseData.setMethod(request.getMethod().name());
    final String charSet = getCharSet(response);
    responseData.setCharSet(charSet);
    final int statusCode = getStatusCode(response);
    responseData.setHttpStatusCode(statusCode);
    responseData.setLastModified(getLastModified(response));
    response.allHeaders().entrySet().forEach(e -> responseData.addMetaData(e.getKey(), e.getValue()));
    // NOTE(review): status 400 itself is handled like a successful response here; confirm
    // whether ">= 400" was intended.
    if (statusCode > 400) {
        responseData.setContentLength(0);
        responseData.setResponseBody(new byte[0]);
        responseData.setMimeType(getContentType(response));
    } else if (download == null) {
        final byte[] body = response.body();
        final byte[] responseBody = getMimeTypeHelper().map(mimeTypeHelper -> {
            final String filename = getFilename(url);
            try (final InputStream in = new ByteArrayInputStream(body)) {
                final String contentType = mimeTypeHelper.getContentType(in, filename);
                responseData.setMimeType(contentType);
                if (logger.isDebugEnabled()) {
                    logger.debug("filename:{} content-type:{}", filename, contentType);
                }
                if ("text/html".equals(contentType)) {
                    // Prefer the page content over the raw response body for HTML.
                    try {
                        final String content = page.content();
                        if (logger.isDebugEnabled()) {
                            logger.debug("html content: {}", content);
                        }
                        return content.getBytes(charSet);
                    } catch (final Exception e) {
                        // Fall back to the raw body below.
                        if (logger.isDebugEnabled()) {
                            logger.debug("Could not get a content from page.", e);
                        }
                    }
                }
            } catch (final IOException e) {
                logger.warn("Could not read from {}", url, e);
            }
            return body;
        }).orElse(body);
        responseData.setContentLength(responseBody.length);
        if (Method.HEAD != request.getMethod()) {
            responseData.setResponseBody(responseBody);
        }
    } else {
        try {
            final File tempFile = File.createTempFile("fess-crawler-playwright-", ".tmp");
            boolean stored = false;
            try {
                download.saveAs(tempFile.toPath());
                responseData.setContentLength(tempFile.length());
                getMimeTypeHelper().ifPresent(mimeTypeHelper -> {
                    final String filename = getFilename(url);
                    try (final InputStream in = new FileInputStream(tempFile)) {
                        final String contentType = mimeTypeHelper.getContentType(in, filename);
                        responseData.setMimeType(contentType);
                        if (logger.isDebugEnabled()) {
                            logger.debug("filename:{} content-type:{}", filename, contentType);
                        }
                    } catch (final IOException e) {
                        logger.warn("Could not read {}", tempFile.getAbsolutePath(), e);
                    }
                });
                // Presumably the "true" flag marks the file as temporary so the response
                // data takes care of it from here on - verify against ResponseData.
                responseData.setResponseBody(tempFile, true);
                stored = true;
            } finally {
                // Do not leave the temporary file behind when it was never handed over.
                if (!stored && !tempFile.delete()) {
                    logger.warn("Could not delete {}", tempFile.getAbsolutePath());
                }
            }
        } catch (final IOException e) {
            throw new IORuntimeException(e);
        } finally {
            download.delete();
        }
    }
    return responseData;
}
/**
 * Extracts the file name (last path segment) from a URL.
 *
 * The fragment ({@code #...}) and the query ({@code ?...}) are removed before the last
 * {@code /} is searched, so a slash inside the query or fragment does not affect the result
 * and URLs ending in {@code /?} or {@code /#} no longer fail.
 *
 * @param url the URL to inspect
 * @return {@code null} for a blank URL, {@code "index.html"} when the last segment is blank,
 *         otherwise the last path segment
 */
protected String getFilename(final String url) {
    if (StringUtil.isBlank(url)) {
        return null;
    }
    String path = url;
    final int fragmentStart = path.indexOf('#');
    if (fragmentStart >= 0) {
        path = path.substring(0, fragmentStart);
    }
    final int queryStart = path.indexOf('?');
    if (queryStart >= 0) {
        path = path.substring(0, queryStart);
    }
    // lastIndexOf returns -1 when there is no slash, which keeps the whole string.
    final String value = path.substring(path.lastIndexOf('/') + 1);
    if (StringUtil.isBlank(value)) {
        return "index.html";
    }
    return value;
}
/**
 * Looks up the {@code mimeTypeHelper} component in the crawler container.
 *
 * @return the helper, or an empty {@link Optional} when the container returns {@code null}
 */
protected Optional<MimeTypeHelper> getMimeTypeHelper() {
    return Optional.ofNullable(crawlerContainer.getComponent("mimeTypeHelper"));
}
/**
 * Returns the media type of the response, i.e. the {@code content-type} header value
 * without its parameters.
 *
 * @param response the response to inspect
 * @return the trimmed media type, or {@code "text/html"} when the header is missing, blank
 *         or carries no media type before the first {@code ;}
 */
protected String getContentType(final Response response) {
    final String contentType = response.headerValue("content-type");
    if (StringUtil.isNotBlank(contentType)) {
        // indexOf/substring instead of split(";")[0]: split returns an empty array for a
        // value consisting only of separators, which made the index access fail.
        final int parametersStart = contentType.indexOf(';');
        final String mediaType = (parametersStart < 0 ? contentType : contentType.substring(0, parametersStart)).trim();
        if (!mediaType.isEmpty()) {
            return mediaType;
        }
    }
    return "text/html";
}
/**
 * Reads the {@code last-modified} response header and parses it with {@link #parseDate(String)}.
 *
 * @param response the response to inspect
 * @return the parsed date, or {@code null} when the header is blank or cannot be parsed
 */
protected Date getLastModified(final Response response) {
    final String lastModifiedHeader = response.headerValue("last-modified");
    return parseDate(lastModifiedHeader);
}
/**
 * Parses a date string in the {@code LAST_MODIFIED_FORMAT} pattern using the English locale.
 *
 * @param value the text to parse; may be {@code null}
 * @return the parsed date, or {@code null} when {@code value} is blank or does not match the pattern
 */
protected Date parseDate(final String value) {
    if (StringUtil.isBlank(value)) {
        return null;
    }
    try {
        // SimpleDateFormat is not thread-safe, so a new instance is created per call.
        final SimpleDateFormat dateFormat = new SimpleDateFormat(LAST_MODIFIED_FORMAT, Locale.ENGLISH);
        return dateFormat.parse(value);
    } catch (final ParseException e) {
        logger.warn("Invalid format: {}", value, e);
        return null;
    }
}
/**
 * Returns the HTTP status code reported by the response.
 *
 * @param response the response to inspect
 * @return the status code
 */
protected int getStatusCode(final Response response) {
    final int status = response.status();
    return status;
}
/**
 * Determines the character set from the {@code charset} parameter of the
 * {@code content-type} response header.
 *
 * The parameter name is matched case-insensitively and the first non-blank value wins.
 * A value wrapped in double quotes (e.g. {@code charset="utf-8"}) is returned without the
 * quotes so that it is usable as a charset name.
 *
 * @param response the response to inspect
 * @return the declared charset, or {@code Constants.UTF_8} when none is declared
 */
protected String getCharSet(final Response response) {
    final String contentType = response.headerValue("content-type");
    if (StringUtil.isNotBlank(contentType)) {
        for (final String parameter : contentType.split(";")) {
            final String[] values = parameter.split("=");
            if (values.length != 2 || !"charset".equalsIgnoreCase(values[0].trim())) {
                continue;
            }
            String charset = values[1].trim();
            if (charset.length() >= 2 && charset.startsWith("\"") && charset.endsWith("\"")) {
                charset = charset.substring(1, charset.length() - 1).trim();
            }
            if (StringUtil.isNotBlank(charset)) {
                return charset;
            }
        }
    }
    return Constants.UTF_8;
}
/**
 * Builds the browser context options from this client's init parameters.
 *
 * Starts from the configured {@code newContextOptions} (or a new instance when none is set),
 * enables ignoring HTTPS errors when either {@code ignoreHttpsErrors} or the HcHttpClient
 * SSL-certificate parameter is true, and adds a proxy when a proxy host is configured.
 *
 * @return the context options to create the browser context with
 */
protected NewContextOptions initNewContextOptions() {
    final NewContextOptions contextOptions;
    if (newContextOptions == null) {
        contextOptions = new NewContextOptions();
    } else {
        contextOptions = newContextOptions;
    }

    // Skip SSL certificate checking when requested.
    // ignoreSslCertificate is also honored for backward compatibility with HcHttpClient's config.
    final boolean ignoreHttpsErrors = getInitParameter(IGNORE_HTTPS_ERRORS_PROPERTY, false, Boolean.class);
    final boolean ignoreSslCertificate = getInitParameter(HcHttpClient.IGNORE_SSL_CERTIFICATE_PROPERTY, false, Boolean.class);
    if (ignoreHttpsErrors || ignoreSslCertificate) {
        contextOptions.ignoreHTTPSErrors = true;
    }

    // Reuse the existing proxy configuration.
    final String proxyHost = getInitParameter(HcHttpClient.PROXY_HOST_PROPERTY, null, String.class);
    final Integer proxyPort = getInitParameter(HcHttpClient.PROXY_PORT_PROPERTY, null, Integer.class);
    final UsernamePasswordCredentials proxyCredentials =
            getInitParameter(HcHttpClient.PROXY_CREDENTIALS_PROPERTY, null, UsernamePasswordCredentials.class);
    final String proxyBypass = getInitParameter(PROXY_BYPASS_PROPERTY, null, String.class);
    if (StringUtils.isBlank(proxyHost)) {
        return contextOptions;
    }

    final String proxyAddress;
    if (proxyPort == null) {
        proxyAddress = proxyHost;
    } else {
        proxyAddress = proxyHost + ":" + proxyPort;
    }
    final Proxy proxy = new Proxy(proxyAddress);
    if (proxyCredentials != null) {
        proxy.setUsername(proxyCredentials.getUserName());
        proxy.setPassword(proxyCredentials.getPassword());
    }
    proxy.setBypass(proxyBypass);
    contextOptions.setProxy(proxy);
    return contextOptions;
}
/**
 * Creates a Playwright browser context that carries the configured authentication.
 *
 * Without configured authentications a plain context is returned. Otherwise the first
 * non-form authentication supplies the HTTP credentials of the context, and a temporary
 * {@link HcHttpClient} is initialized with this client's init parameters so that the cookies
 * in its cookie store can be copied into the new context. If that transfer fails, the
 * context is closed again before the failure is rethrown.
 *
 * @param browser the browser to create the context in
 * @param newContextOptions the context options; HTTP credentials may be set on this instance
 * @return the new browser context
 */
protected BrowserContext createAuthenticatedContext(final Browser browser, final NewContextOptions newContextOptions) {
    final Authentication[] authentications =
            getInitParameter(HcHttpClient.AUTHENTICATIONS_PROPERTY, new Authentication[0], Authentication[].class);
    if (authentications.length == 0) {
        return browser.newContext(newContextOptions);
    }
    for (final Authentication authentication : authentications) {
        if (!StringUtils.equals(authentication.getAuthScheme().getSchemeName(), "form")) {
            // Use the first non-form auth credentials to fill the browser's credential prompt
            final String username = authentication.getCredentials().getUserPrincipal().getName();
            final String password = authentication.getCredentials().getPassword();
            newContextOptions.setHttpCredentials(username, password);
            break;
        }
    }
    final BrowserContext playwrightContext = browser.newContext(newContextOptions);
    try (final var fessHttpClient = new HcHttpClient()) {
        fessHttpClient.setInitParameterMap(initParamMap);
        fessHttpClient.init();
        final List<org.apache.http.cookie.Cookie> fessCookies = fessHttpClient.cookieStore.getCookies();
        final List<Cookie> playwrightCookies = fessCookies.stream().map(apacheCookie -> {
            final var playwrightCookie = new Cookie(apacheCookie.getName(), apacheCookie.getValue());
            playwrightCookie.setDomain(apacheCookie.getDomain());
            playwrightCookie.setPath(apacheCookie.getPath());
            playwrightCookie.setSecure(apacheCookie.isSecure());
            // Set expiry time - Apache's cookies use milliseconds as time unit (via Date object),
            // while Playwright uses seconds.
            final Date cookieExpiryDate = apacheCookie.getExpiryDate();
            if (cookieExpiryDate != null) {
                playwrightCookie.setExpires(cookieExpiryDate.getTime() / 1000.0);
            }
            return playwrightCookie;
        }).toList();
        playwrightContext.addCookies(playwrightCookies);
        return playwrightContext;
    } catch (final RuntimeException e) {
        // The context was created above but will never reach the caller; close it here.
        try {
            playwrightContext.close();
        } catch (final RuntimeException closeFailure) {
            e.addSuppressed(closeFailure);
        }
        throw e;
    }
}
/**
 * Sets the options passed to the browser launch.
 *
 * @param launchOptions the launch options
 */
public void setLaunchOptions(final LaunchOptions launchOptions) {
this.launchOptions = launchOptions;
}
/**
 * Sets the browser engine name; {@code getBrowserType} accepts {@code chromium}, {@code firefox} and {@code webkit}.
 *
 * @param browserName the browser name
 */
public void setBrowserName(final String browserName) {
this.browserName = browserName;
}
/**
 * Sets how long, in seconds, {@code execute} polls for a download after a failed navigation.
 *
 * @param downloadTimeout the timeout in seconds
 */
public void setDownloadTimeout(final int downloadTimeout) {
this.downloadTimeout = downloadTimeout;
}
/**
 * Sets the load state awaited after navigation.
 *
 * @param loadState the load state to wait for
 */
public void setRenderedState(final LoadState loadState) {
renderedState = loadState;
}
/**
 * Sets how long, in seconds, each close step is waited for.
 *
 * @param closeTimeout the timeout in seconds
 */
public void setCloseTimeout(final int closeTimeout) {
this.closeTimeout = closeTimeout;
}
/**
 * Sets the base options used when the browser context is created.
 *
 * @param newContextOptions the context options
 */
public void setNewContextOptions(final NewContextOptions newContextOptions) {
this.newContextOptions = newContextOptions;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy