
com.jaeksoft.searchlib.util.LinkUtils Maven / Gradle / Ivy
/**
* License Agreement for OpenSearchServer
*
* Copyright (C) 2008-2014 Emmanuel Keller / Jaeksoft
*
* http://www.open-search-server.com
*
* This file is part of OpenSearchServer.
*
* OpenSearchServer is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OpenSearchServer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenSearchServer.
* If not, see .
**/
package com.jaeksoft.searchlib.util;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import org.apache.http.NameValuePair;
import org.apache.http.client.utils.URIUtils;
import org.apache.http.client.utils.URLEncodedUtils;
import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.crawler.web.database.UrlFilterItem;
import com.jaeksoft.searchlib.crawler.web.database.UrlFilterList;
public class LinkUtils {
public final static URL getLink(final URL currentURL, final String srcHref,
final UrlFilterItem[] urlFilterList, final boolean removeFragment) {
if (srcHref == null)
return null;
String href = srcHref.trim();
if (href.length() == 0)
return null;
String fragment = null;
URI u = null;
try {
if (!href.contains("://")) {
URI currentURI = null;
try {
currentURI = currentURL.toURI();
} catch (URISyntaxException e) {
currentURI = new URI(URLEncoder.encode(
currentURL.toString(), "UTF-8"));
}
try {
u = URIUtils.resolve(currentURI, href);
} catch (IllegalArgumentException e) {
href = URLEncoder.encode(href, "UTF-8");
u = URIUtils.resolve(currentURI, href);
}
} else {
try {
u = new URI(href);
} catch (URISyntaxException e) {
u = new URI(URLEncoder.encode(href, "UTF-8"));
}
}
href = UrlFilterList.doReplace(u.getHost(), u.toString(),
urlFilterList);
URI uri = URI.create(href);
uri = uri.normalize();
String p = uri.getPath();
if (p != null)
if (p.contains("/./") || p.contains("/../"))
return null;
if (!removeFragment)
fragment = uri.getRawFragment();
URL finalURL = new URI(uri.getScheme(), uri.getUserInfo(),
uri.getHost(), uri.getPort(), uri.getPath(),
uri.getQuery(), fragment).normalize().toURL();
return finalURL;
} catch (MalformedURLException e) {
Logging.info(e.getMessage());
return null;
} catch (URISyntaxException e) {
Logging.info(e.getMessage());
return null;
} catch (IllegalArgumentException e) {
Logging.info(e.getMessage(), e);
return null;
} catch (UnsupportedEncodingException e) {
Logging.info(e.getMessage(), e);
return null;
}
}
public final static String concatPath(String path1, String path2) {
if (path2 == null)
return path1;
if (path1 == null)
return path2;
StringBuilder sb = new StringBuilder(path1);
if (!path1.endsWith("/") && !path2.startsWith("/"))
sb.append('/');
sb.append(path2);
return sb.toString();
}
public final static String lastPart(String path) {
if (path == null)
return null;
String[] parts = StringUtils.split(path, '/');
if (parts == null)
return path;
if (parts.length == 0)
return path;
return parts[parts.length - 1];
}
public final static String UTF8_URL_Encode(String s)
throws UnsupportedEncodingException {
return URLEncoder.encode(s, "UTF-8").replace("+", "%20");
}
public final static String UTF8_URL_QuietDecode(String s) {
try {
return URLDecoder.decode(s, "UTF-8");
} catch (UnsupportedEncodingException e) {
return s;
}
}
public final static URI newEncodedURI(String u)
throws MalformedURLException, URISyntaxException {
URL tmpUrl = new URL(u);
return new URI(tmpUrl.getProtocol(), tmpUrl.getUserInfo(),
tmpUrl.getHost(), tmpUrl.getPort(), tmpUrl.getPath(),
tmpUrl.getQuery(), tmpUrl.getRef());
}
public final static URL newEncodedURL(String u)
throws MalformedURLException, URISyntaxException {
return newEncodedURI(u).toURL();
}
public final static void main(String[] args) throws MalformedURLException,
UnsupportedEncodingException {
System.out.println(getLink(new URL(
"http://www.example.com/test/in-75?l=75&co=FR&start=20"),
"?l=75&co=FR&start=20", null, false));
System.out.println(getLink(new URL("http://www.example.com/test/"),
"/category/index.jsp?categoryId=4955781&ab=denim & supply",
null, false));
System.out.println(lastPart("/my+folder/"));
System.out.println(lastPart("my folder/"));
System.out.println(lastPart("my folder/my+sub-folder/"));
System.out.println(lastPart("/my+file.png"));
System.out.println(lastPart("my+file.png"));
System.out.println(lastPart("my+folder/my+sub-folder/my+file.png"));
System.out.println(UTF8_URL_Encode("outlook:INBOX/~TEST TEST"));
}
public final static Map getUniqueQueryParameters(
final URI uri, final String charset) {
final Map map = new TreeMap();
final List parameters = URLEncodedUtils.parse(uri,
"UTF-8");
for (NameValuePair parameter : parameters)
map.put(parameter.getName(), parameter.getValue());
return map;
}
public final static URL getURL(String urlString, boolean logError) {
if (StringUtils.isEmpty(urlString))
return null;
try {
return new URL(urlString);
} catch (MalformedURLException e) {
if (logError)
Logging.warn("Malformed URL: " + e);
return null;
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy