All downloads are free. Search and download functionality uses the official Maven repository.

com.alibaba.antx.config.resource.util.TextBasedPageParser Maven / Gradle / Ivy

There is a newer version: 1.2
Show newest version
/*
 * Copyright (c) 2002-2012 Alibaba Group Holding Limited.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.alibaba.antx.config.resource.util;

import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;

import com.alibaba.antx.config.resource.Resource;
import com.alibaba.antx.util.ByteArrayOutputStream;
import org.dom4j.Document;
import org.dom4j.io.DOMReader;
import org.dom4j.io.SAXReader;
import org.w3c.tidy.Tidy;

/**
 * Abstract base for {@link IndexPageParser} implementations that work on
 * text-based index pages. It offers helpers to parse a resource as XML or
 * HTML, to choose the charset for a resource, and to turn an entry name
 * into an {@code Item}.
 */
public abstract class TextBasedPageParser implements IndexPageParser {
    /** When non-null, this charset wins over whatever the resource reports. */
    private String overridingCharset;

    /** Creates a parser without an overriding charset. */
    public TextBasedPageParser() {
    }

    /**
     * Creates a parser that uses the given charset in preference to the
     * charset reported by a resource.
     *
     * @param overridingCharset the charset to force, or {@code null} for none
     */
    public TextBasedPageParser(String overridingCharset) {
        setOverridingCharset(overridingCharset);
    }

    /**
     * Sets the charset returned by {@link #getCharset(Resource)} regardless of
     * the resource's own charset.
     *
     * @param charset the charset to force, or {@code null} to clear the override
     */
    public void setOverridingCharset(String charset) {
        this.overridingCharset = charset;
    }

    /**
     * Gets the XML document of a resource.
     *
     * <p>Only resources whose content type starts with {@code text/xml} are
     * parsed. The resource's input stream is always closed before returning.
     *
     * @param resource the resource to read
     * @return the parsed document, or {@code null} if the content type does
     *         not match or the content could not be read or parsed
     */
    protected Document getXmlDocument(Resource resource) {
        String contentType = resource.getContentType();

        if (contentType == null || !contentType.startsWith("text/xml")) {
            return null;
        }

        InputStream stream = null;

        try {
            stream = resource.getInputStream();

            // NOTE(review): SAXReader is used with its default settings; if the
            // resource can come from an untrusted server, consider disabling
            // external entity resolution here.
            return new SAXReader().read(stream);
        } catch (Exception ignored) {
            // Best effort: an unreadable or malformed page is reported to the
            // caller as "no document" rather than as a failure.
            return null;
        } finally {
            closeQuietly(stream);
        }
    }

    /**
     * Gets the HTML document of a resource.
     *
     * <p>Only resources whose content type starts with {@code text/html} are
     * parsed. The page is run through JTidy with XML output enabled and the
     * resulting DOM is converted to a dom4j document. Tidy's error output is
     * sent to an in-memory buffer that is never read. The resource's input
     * stream is always closed before returning.
     *
     * @param resource the resource to read
     * @return the parsed document, or {@code null} if the content type does
     *         not match or the content could not be read or parsed
     */
    protected Document getHtmlDocument(Resource resource) {
        String contentType = resource.getContentType();

        if (contentType == null || !contentType.startsWith("text/html")) {
            return null;
        }

        InputStream stream = null;

        try {
            Tidy tidy = new Tidy();
            tidy.setQuiet(true);
            tidy.setXmlOut(true);
            tidy.setErrout(new PrintWriter(new ByteArrayOutputStream()));

            stream = resource.getInputStream();

            org.w3c.dom.Document dom = tidy.parseDOM(stream, null);

            return new DOMReader().read(dom);
        } catch (Exception ignored) {
            // Best effort: an unreadable or malformed page is reported to the
            // caller as "no document" rather than as a failure.
            return null;
        } finally {
            closeQuietly(stream);
        }
    }

    /**
     * Chooses the charset to use for a resource.
     *
     * @param resource the resource being read
     * @return the overriding charset if one is set; otherwise the resource's
     *         own charset if it is non-null; otherwise {@code "ISO-8859-1"}
     */
    protected String getCharset(Resource resource) {
        if (overridingCharset != null) {
            return overridingCharset;
        }

        String resourceCharset = resource.getCharset();

        return resourceCharset != null ? resourceCharset : "ISO-8859-1";
    }

    /**
     * Gets an item by name.
     *
     * <p>A single trailing {@code "/"} marks the name as a directory and is
     * removed before validation.
     *
     * @param name the entry name, possibly ending with {@code "/"}
     * @return the item, or {@code null} if {@code name} is {@code null} or,
     *         after removing the trailing slash, is empty, contains
     *         {@code "/"}, starts with {@code "?"}, or equals {@code "."} or
     *         {@code ".."}
     */
    protected Item getItem(String name) {
        if (name == null) {
            return null;
        }

        boolean directory = name.endsWith("/");
        String itemName = directory ? name.substring(0, name.length() - 1) : name;

        boolean rejected = itemName.length() == 0
                           || itemName.indexOf("/") >= 0
                           || itemName.startsWith("?")
                           || itemName.equals(".")
                           || itemName.equals("..");

        if (rejected) {
            return null;
        }

        return new Item(itemName, directory);
    }

    /** Closes the stream if it is non-null, ignoring any failure to close. */
    private static void closeQuietly(InputStream stream) {
        if (stream == null) {
            return;
        }

        try {
            stream.close();
        } catch (IOException ignored) {
            // Nothing useful can be done here; the parse result (or null) has
            // already been decided by the caller.
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy