All Downloads are FREE. Search and download functionalities are using the official Maven repository.

crawler.rule.xml Maven / Gradle / Ivy

There is a newer version: 14.18.0
Show newest version
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE components PUBLIC "-//DBFLUTE//DTD LastaDi 1.0//EN"
	"http://dbflute.org/meta/lastadi10.dtd">
<components namespace="fessCrawler">
	<include path="crawler/container.xml" />
	<include path="crawler/transformer.xml" />

	<!--
		Rule manager for the crawler, declared with instance="prototype".
		After construction, addRule is invoked once per rule, in the order listed
		here: sitemapsRule, webHtmlRule, webFileRule, fsFileRule, defaultRule.
		The unquoted arg values are the names of the components defined later in
		this file. defaultRule, whose only pattern is ".*", is registered last.
		NOTE(review): presumably rules are tried in registration order, which would
		make this ordering significant; confirm against RuleManagerImpl.
	-->
	<component name="ruleManager" class="org.codelibs.fess.crawler.rule.impl.RuleManagerImpl" instance="prototype">
		<postConstruct name="addRule">
			<arg>sitemapsRule</arg>
		</postConstruct>
		<postConstruct name="addRule">
			<arg>webHtmlRule</arg>
		</postConstruct>
		<postConstruct name="addRule">
			<arg>webFileRule</arg>
		</postConstruct>
		<postConstruct name="addRule">
			<arg>fsFileRule</arg>
		</postConstruct>
		<postConstruct name="addRule">
			<arg>defaultRule</arg>
		</postConstruct>
	</component>

	<!--
		Rule with id "sitemapsRule". Its response processor is a
		SitemapsResponseProcessor with no properties set.
		A single "url" pattern is added with three alternatives, one each for
		sitemap files ending in xml, gz and txt: an http or https URL containing
		"sitemap", then any characters other than "/", then the extension, then
		anything.
		Unlike the other RegexRule components in this file, allRequired is not
		set here.
		NOTE(review): how the backslash in "\." is interpreted depends on the
		container's string-literal handling; confirm if the dot must be literal.
	-->
	<component name="sitemapsRule" class="org.codelibs.fess.crawler.rule.impl.RegexRule" >
		<property name="ruleId">"sitemapsRule"</property>
		<property name="responseProcessor">
			<component class="org.codelibs.fess.crawler.processor.impl.SitemapsResponseProcessor">
			</component>
		</property>
		<postConstruct name="addRule">
			<arg>"url"</arg>
			<arg>"http[s]?:.*sitemap[^/]*\.xml.*|http[s]?:.*sitemap[^/]*\.gz.*|http[s]?:.*sitemap[^/]*\.txt.*"</arg>
		</postConstruct>
	</component>

	<!--
		Rule with id "webHtmlRule" for HTML fetched over http or https.
		Its FessResponseProcessor uses fessXpathTransformer, with HTTP 200
		configured as the successful code and 304 as the not-modified code.
		allRequired is true and two patterns are added: "url" (http: or https:
		prefix) and "mimeType" ("text/html").
		NOTE(review): fessXpathTransformer is not defined in this file; presumably
		it comes from the included crawler/transformer.xml. Confirm.
	-->
	<component name="webHtmlRule" class="org.codelibs.fess.crawler.rule.impl.RegexRule" >
		<property name="ruleId">"webHtmlRule"</property>
		<property name="responseProcessor">
			<component class="org.codelibs.fess.crawler.processor.FessResponseProcessor">
				<property name="transformer">fessXpathTransformer</property>
				<property name="successfulHttpCodes">(int[])[200]</property>
				<property name="notModifiedHttpCodes">(int[])[304]</property>
			</component>
		</property>
		<property name="allRequired">true</property>
		<postConstruct name="addRule">
			<arg>"url"</arg>
			<arg>"http[s]?:.*"</arg>
		</postConstruct>
		<postConstruct name="addRule">
			<arg>"mimeType"</arg>
			<!-- Supported MIME type -->
			<arg>"text/html"</arg>
		</postConstruct>
	</component>

	<!--
		Rule with id "webFileRule" for non-HTML documents fetched over http or
		https. Its FessResponseProcessor uses fessFileTransformer, with HTTP 200
		configured as the successful code and 304 as the not-modified code.
		allRequired is true and two patterns are added: "url" (http: or https:
		prefix) and "mimeType" (the alternation of MIME types listed below).
		NOTE(review): fessFileTransformer is not defined in this file; presumably
		it comes from the included crawler/transformer.xml. Confirm.
	-->
	<component name="webFileRule" class="org.codelibs.fess.crawler.rule.impl.RegexRule" >
		<property name="ruleId">"webFileRule"</property>
		<property name="responseProcessor">
			<component class="org.codelibs.fess.crawler.processor.FessResponseProcessor">
				<property name="transformer">fessFileTransformer</property>
				<property name="successfulHttpCodes">(int[])[200]</property>
				<property name="notModifiedHttpCodes">(int[])[304]</property>
			</component>
		</property>
		<property name="allRequired">true</property>
		<postConstruct name="addRule">
			<arg>"url"</arg>
			<arg>"http[s]?:.*"</arg>
		</postConstruct>
		<postConstruct name="addRule">
			<arg>"mimeType"</arg>
			<!--
				Supported MIME types, joined into one regular expression.
				"+" is a regex quantifier, so a bare "xhtml+xml" would not match the
				literal MIME type. The plus sign is written as the character class
				[+] so it matches literally without needing a backslash escape.
			-->
			<arg>
  "(application/xml"
+ "|application/xhtml[+]xml"
+ "|application/rdf[+]xml"
+ "|application/pdf"
+ "|application/x-freemind"
+ "|text/xml"
+ "|text/xml-external-parsed-entity)"
			</arg>
		</postConstruct>
	</component>

	<!--
		Rule with id "fsFileRule" for documents reached through file:, smb:,
		smb1: or ftp: URLs. Its FessResponseProcessor uses fessFileTransformer,
		with 200 configured as the successful code and 304 as the not-modified
		code.
		allRequired is true and two patterns are added: "url" (one of the four
		schemes above) and "mimeType" (the alternation of MIME types listed below,
		which includes "text/html").
		NOTE(review): fessFileTransformer is not defined in this file; presumably
		it comes from the included crawler/transformer.xml. Confirm.
	-->
	<component name="fsFileRule" class="org.codelibs.fess.crawler.rule.impl.RegexRule" >
		<property name="ruleId">"fsFileRule"</property>
		<property name="responseProcessor">
			<component class="org.codelibs.fess.crawler.processor.FessResponseProcessor">
				<property name="transformer">fessFileTransformer</property>
				<property name="successfulHttpCodes">(int[])[200]</property>
				<property name="notModifiedHttpCodes">(int[])[304]</property>
			</component>
		</property>
		<property name="allRequired">true</property>
		<postConstruct name="addRule">
			<arg>"url"</arg>
			<arg>"(file|smb|smb1|ftp):.*"</arg>
		</postConstruct>
		<postConstruct name="addRule">
			<arg>"mimeType"</arg>
			<!--
				Supported MIME types, joined into one regular expression.
				"+" is a regex quantifier, so a bare "xhtml+xml" would not match the
				literal MIME type. The plus sign is written as the character class
				[+] so it matches literally without needing a backslash escape.
			-->
			<arg>
  "(application/xml"
+ "|application/xhtml[+]xml"
+ "|application/rdf[+]xml"
+ "|application/pdf"
+ "|application/x-freemind"
+ "|application/lha"
+ "|application/x-lha"
+ "|application/x-lha-compressed"
+ "|text/xml"
+ "|text/xml-external-parsed-entity"
+ "|text/html)"
			</arg>
		</postConstruct>
	</component>


	<!--
		Rule with id "defaultRule". Its only pattern is "url" with ".*", and no
		"mimeType" pattern is added. Its FessResponseProcessor uses
		fessStandardTransformer, with HTTP 200 configured as the successful code
		and 304 as the not-modified code. allRequired is true.
		NOTE(review): fessStandardTransformer is not defined in this file;
		presumably it comes from the included crawler/transformer.xml. Confirm.
	-->
	<component name="defaultRule" class="org.codelibs.fess.crawler.rule.impl.RegexRule" >
		<property name="ruleId">"defaultRule"</property>
		<property name="responseProcessor">
			<component class="org.codelibs.fess.crawler.processor.FessResponseProcessor">
				<property name="transformer">fessStandardTransformer</property>
				<property name="successfulHttpCodes">(int[])[200]</property>
				<property name="notModifiedHttpCodes">(int[])[304]</property>
			</component>
		</property>
		<property name="allRequired">true</property>
		<postConstruct name="addRule">
			<arg>"url"</arg>
			<arg>".*"</arg>
		</postConstruct>
	</component>

</components>




© 2015 - 2025 Weber Informatics LLC | Privacy Policy