All downloads are free. Search and download functionality uses the official Maven repository.

org.archive.modules.extractor.ExtractorSWF Maven / Gradle / Ivy

Go to download

This project contains some of the configurable modules used within the Heritrix application to crawl the web. The modules in this project can be used in applications other than Heritrix, however.

There is a newer version: 3.6.0
Show newest version
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.modules.extractor;

import java.io.IOException;
import java.io.InputStream;
import java.util.logging.Logger;

import org.archive.modules.CrawlURI;
import org.archive.util.UriUtils;
import org.springframework.beans.factory.annotation.Autowired;

import com.anotherbigidea.flash.interfaces.SWFActions;
import com.anotherbigidea.flash.interfaces.SWFTagTypes;
import com.anotherbigidea.flash.interfaces.SWFTags;
import com.anotherbigidea.flash.readers.ActionParser;
import com.anotherbigidea.flash.readers.SWFReader;
import com.anotherbigidea.flash.readers.TagParser;
import com.anotherbigidea.flash.structs.AlphaTransform;
import com.anotherbigidea.flash.structs.Matrix;
import com.anotherbigidea.flash.writers.SWFActionsImpl;
import com.anotherbigidea.io.InStream;

/**
 * Extracts URIs from SWF (flash/shockwave) files.
 * 
 * To test, here is a link to an swf that has links
 * embedded inside of it: http://www.hitspring.com/index.swf.
 *
 * @author Igor Ranitovic
 */
public class ExtractorSWF extends ContentExtractor {

    @SuppressWarnings("unused")
    private static final long serialVersionUID = 3L;

    private static Logger logger =
        Logger.getLogger(ExtractorSWF.class.getName());

    private static final int MAX_READ_SIZE = 1024 * 1024; // 1MB

    protected static final String JSSTRING = "javascript:";

    /**
     * Javascript extractor to use to process inline javascript. Autowired if
     * available. If null, links will not be extracted from inline javascript.
     */
    transient protected ExtractorJS extractorJS;
    /**
     * Returns the javascript extractor used for inline javascript.
     *
     * @return the configured {@link ExtractorJS}, or null if none has been set
     */
    public ExtractorJS getExtractorJS() {
        return this.extractorJS;
    }
    /**
     * Sets the javascript extractor used for inline javascript. The method is
     * annotated {@code @Autowired}, so it may also be invoked by dependency
     * injection rather than by explicit configuration.
     *
     * @param extractorJS the extractor to use; null leaves inline javascript
     *            unprocessed
     */
    @Autowired
    public void setExtractorJS(ExtractorJS extractorJS) {
        this.extractorJS = extractorJS;
    }
    
    /**
     * Creates an extractor without performing any initialization; the
     * javascript extractor stays unset until
     * {@link #setExtractorJS(ExtractorJS)} is called.
     */
    public ExtractorSWF() {
    }

    
    /**
     * Decides whether this extractor should process the given URI. A URI
     * qualifies only when it reports a content type, and then either that
     * content type mentions "x-shockwave-flash" or the URI text ends in
     * ".swf" (both checks ignore case).
     *
     * @param uri the URI under consideration
     * @return true if SWF extraction should be attempted, false otherwise
     */
    @Override
    protected boolean shouldExtract(CrawlURI uri) {
        final String mimeType = uri.getContentType();
        if (mimeType == null) {
            return false;
        }
        if (mimeType.toLowerCase().contains("x-shockwave-flash")) {
            return true;
        }
        // Content type did not match; fall back to the file extension.
        return uri.toString().toLowerCase().endsWith(".swf");
    }

    
    /**
     * Parses the recorded SWF content of the given URI and adds every
     * discovered link to it.
     *
     * <p>An {@link IOException} raised while reading or closing the content
     * is recorded in the URI's non-fatal failures rather than propagated.
     *
     * @param curi the URI whose recorded content is parsed
     * @return false if no content replay stream is available; true
     *         otherwise, including when parsing ended early with an
     *         IOException
     */
    @Override
    protected boolean innerExtract(CrawlURI curi) {
        InputStream documentStream = null;
        // Get the SWF file's content stream.
        try {
            documentStream = curi.getRecorder().getContentReplayInputStream();
            if (documentStream == null) {
                return false;
            }

            // Create SWF action that will add discovered URIs to CrawlURI
            // alist(s).
            CrawlUriSWFAction curiAction = new CrawlUriSWFAction(curi, this);

            // Overwrite parsing of specific tags that might have URIs.
            CustomSWFTags customTags = new CustomSWFTags(curiAction);
            // Get a SWFReader instance.
            SWFReader reader =
                new ExtractorSWFReader(new ExtractorTagParser(customTags), documentStream);

            reader.readFile();
            numberOfLinksExtracted.addAndGet(curiAction.getLinkCount());
            logger.fine(curi + " has " + curiAction.getLinkCount() + " links.");
        } catch (IOException e) {
            curi.getNonFatalFailures().add(e);
        } finally {
            // The stream is still null when no replay stream was available
            // or when obtaining it failed; closing unconditionally would
            // throw a NullPointerException from this finally block.
            if (documentStream != null) {
                try {
                    documentStream.close();
                } catch (IOException e) {
                    curi.getNonFatalFailures().add(e);
                }
            }
        }

        // Set flag to indicate that link extraction is completed.
        return true;
    }

    /**
     * SWFReader variant that never buffers a tag larger than
     * {@code MAX_READ_SIZE} and rejects tag lengths that cannot be
     * represented as a non-negative int.
     */
    class ExtractorSWFReader extends SWFReader
    {
        /**
         * @param consumer receiver of the tags that are read
         * @param inputstream raw stream holding the SWF data
         */
        public ExtractorSWFReader(SWFTags consumer, InputStream inputstream) {
            super(consumer, inputstream);
        }

        /**
         * @param consumer receiver of the tags that are read
         * @param instream SWF input stream to read tags from
         */
        public ExtractorSWFReader(SWFTags consumer, InStream instream) {
            super(consumer, instream);
        }

        /**
         * Override because a corrupt SWF file can cause us to try read lengths
         * that are hundreds of megabytes in size causing us to OOME.
         *
         * Below is copied from SWFReader parent class, with added length
         * checks: tags longer than MAX_READ_SIZE are skipped and logged, and
         * a declared length outside the non-negative int range is rejected.
         *
         * @return the type of the tag that was read or skipped
         * @throws IOException if the stream cannot be read, or if a long tag
         *             declares a length that does not fit a non-negative int
         */
        public int readOneTag() throws IOException {
            int header = mIn.readUI16();
            int type = header >> 6; // only want the top 10 bits
            int length = header & 0x3F; // only want the bottom 6 bits
            boolean longTag = (length == 0x3F);
            if (longTag) {
                // Check the 32-bit value before narrowing: casting straight
                // to int turns declared lengths of 2^31 or more into negative
                // numbers, which would slip past the size test below and be
                // used as a read length.
                long declaredLength = mIn.readUI32();
                if (declaredLength < 0 || declaredLength > Integer.MAX_VALUE) {
                    throw new IOException("invalid SWF tag length (type="
                            + type + ";length=" + declaredLength + ")");
                }
                length = (int) declaredLength;
            }
            // Below test added for Heritrix use.
            if (length > MAX_READ_SIZE) {
                // skip to next, rather than throw IOException ending
                // processing
                mIn.skipBytes(length);
                logger.info("oversized SWF tag (type=" + type + ";length="
                        + length + ") skipped");
            } else {
                byte[] contents = mIn.read(length);
                mConsumer.tag(type, longTag, contents);
            }
            return type;
        }
    }
    /**
     * TagParser customized to ignore SWFTags that 
     * will never contain extractable URIs. 
     */
    protected class ExtractorTagParser extends TagParser {

        /**
         * @param tagtypes receiver of the parsed tags, handed unchanged to
         *            the TagParser superclass
         */
        protected ExtractorTagParser(SWFTagTypes tagtypes) {
            super(tagtypes);
        }

        // The handlers below are deliberately empty: their tags carry image,
        // sound, font or shape data in which no URLs are expected, so the
        // tag contents are left unparsed.

        protected void parseDefineBits(InStream in) throws IOException {
            // DO NOTHING - no URLs to be found in bits
        }

        protected void parseDefineBitsJPEG3(InStream in) throws IOException {
            // DO NOTHING - no URLs to be found in bits
        }

        protected void parseDefineBitsLossless(InStream in, int length, boolean hasAlpha) throws IOException {
            // DO NOTHING - no URLs to be found in bits
        }

        protected void parseDefineButtonSound(InStream in) throws IOException {
            // DO NOTHING - no URLs to be found in sound
        }

        protected void parseDefineFont(InStream in) throws IOException {
            // DO NOTHING - no URLs to be found in font
        }

        protected void parseDefineJPEG2(InStream in, int length) throws IOException {
            // DO NOTHING - no URLs to be found in jpeg
        }

        protected void parseDefineJPEGTables(InStream in) throws IOException {
            // DO NOTHING - no URLs to be found in jpeg
        }

        protected void parseDefineShape(int type, InStream in) throws IOException {
            // DO NOTHING - no URLs to be found in shape
        }

        protected void parseDefineSound(InStream in) throws IOException {
            // DO NOTHING - no URLs to be found in sound
        }

        protected void parseFontInfo(InStream in, int length, boolean isFI2) throws IOException {
            // DO NOTHING - no URLs to be found in font info
        }

        protected void parseDefineFont2(InStream in) throws IOException {
            // DO NOTHING - no URLs to be found in font
        }
        
        /**
         * Parses a DefineSprite tag by reading the sprite's nested tags.
         *
         * heritrix: Overridden to use our TagParser and SWFReader, so that
         * nested tags also skip the ignored tag types and are also subject
         * to the oversized-tag guard in ExtractorSWFReader. The rest of the
         * code is the same.
         *
         * @param in stream positioned at the start of the sprite definition
         * @throws IOException if the sprite or its nested tags cannot be read
         */
        @Override
        protected void parseDefineSprite(InStream in) throws IOException {
            int id = in.readUI16();
            in.readUI16(); // frame count

            SWFTagTypes sstt = mTagtypes.tagDefineSprite(id);

            if (sstt == null) {
                return;
            }

            // heritrix: only these two lines differ from
            // super.parseDefineSprite(). A plain SWFReader here would let
            // nested tags bypass the length checks in ExtractorSWFReader.
            TagParser parser = new ExtractorTagParser(sstt);
            SWFReader reader = new ExtractorSWFReader(parser, in);

            reader.readTags();
        }
        
        // Overridden to read 32 bit clip event flags when flash version >= 6.
        // All the rest of the code is copied directly. Fixes HER-1509.
        //
        // Reads, in order: eight one-bit presence flags, the depth, each
        // optional field whose flag is set, and (when clip actions are
        // present) the clip event flags. The values are passed to
        // mTagtypes.tagPlaceObject2(); if that returns a non-null SWFActions,
        // the clip action records are then parsed into it.
        @Override
        protected void parsePlaceObject2( InStream in ) throws IOException
        {
            // Presence flags; the order of these reads must match the order
            // of the bits in the stream.
            boolean hasClipActions    = in.readUBits(1) != 0;
            boolean hasClipDepth      = in.readUBits(1) != 0;
            boolean hasName           = in.readUBits(1) != 0;
            boolean hasRatio          = in.readUBits(1) != 0;
            boolean hasColorTransform = in.readUBits(1) != 0;
            boolean hasMatrix         = in.readUBits(1) != 0;
            boolean hasCharacter      = in.readUBits(1) != 0;
            boolean isMove            = in.readUBits(1) != 0;

            int depth = in.readUI16();

            // Each optional field is consumed from the stream only when its
            // flag is set; otherwise a placeholder (0, null or -1) is used.
            int            charId    = hasCharacter      ? in.readUI16()            : 0;
            Matrix         matrix    = hasMatrix         ? new Matrix( in )         : null;
            AlphaTransform cxform    = hasColorTransform ? new AlphaTransform( in ) : null;
            int            ratio     = hasRatio          ? in.readUI16()            : -1;        
            String         name      = hasName           ? in.readString(mStringEncoding)  : null;  
            int            clipDepth = hasClipDepth      ? in.readUI16()            : 0;

            int clipEventFlags = 0;

            if (hasClipActions) {
                in.readUI16(); // reserved

                // heritrix: flags size changed in swf version 6
                clipEventFlags = mFlashVersion < 6 ? in.readUI16() : in.readSI32();
            }

            SWFActions actions = mTagtypes.tagPlaceObject2(isMove, clipDepth,
                    depth, charId, matrix, cxform, ratio, name, clipEventFlags);

            // When the consumer returns null the clip action records are
            // left unread.
            if (hasClipActions && actions != null) {
                int flags = 0;

                // heritrix: flags size changed in swf version 6
                // A flags value of zero ends the list of clip action records.
                while ((flags = mFlashVersion < 6 ? in.readUI16() : in.readSI32()) != 0) {
                    in.readUI32(); // length

                    actions.start(flags);
                    ActionParser parser = new ActionParser(actions, mFlashVersion);

                    parser.parse(in);
                }

                actions.done();
            }
        }

    }
    
    
    /**
     * SWF action that handles discovered URIs.
     *
     * @author Igor Ranitovic
     */
    public class CrawlUriSWFAction extends SWFActionsImpl {
        
        protected CrawlURI curi;
        
        private long linkCount;
        private Extractor ext;

        /**
         * Creates an action handler that records discovered links against
         * the given URI.
         *
         * @param curi URI whose SWF content is being parsed; asserted to be
         *            non-null
         * @param ext extractor on whose behalf the links are added
         */
        public CrawlUriSWFAction(CrawlURI curi, Extractor ext) {
            assert (curi != null) : "CrawlURI should not be null";
            this.ext = ext;
            this.curi = curi;
            this.linkCount = 0L;
        }
        
        /**
         * Handles a URL found in a getURL action. URLs starting with
         * "javascript:" are handed to the javascript extractor when one is
         * configured and are otherwise dropped; every other URL is added
         * both relative to the via and relative to the base as an embed
         * link.
         *
         * @param url Discovered URL.
         * @param target Discovered target (currently not being used.)
         * @throws IOException
         */
        public void getURL(String url, String target)
        throws IOException {
            if (url.startsWith(JSSTRING)) {
                final ExtractorJS jsExtractor = getExtractorJS();
                if (jsExtractor != null) {
                    linkCount += jsExtractor.considerStrings(ext, curi, url);
                }
                return;
            }

            final int outlinkLimit = ext.getExtractorParameters().getMaxOutlinks();
            final CrawlURI viaLink = addRelativeToVia(curi, outlinkLimit, url,
                    LinkContext.EMBED_MISC, Hop.EMBED);
            final CrawlURI baseLink = addRelativeToBase(curi, outlinkLimit, url,
                    LinkContext.EMBED_MISC, Hop.EMBED);
            addAnnotations(viaLink, baseLink);
            linkCount++;
        }

        /**
         * Annotates the two candidate links to show how each was resolved.
         * When both are present and have equal UURIs, both are tagged
         * "extractorSWFRelToBoth"; otherwise each non-null link gets its own
         * via or base tag.
         *
         * @param relToVia link resolved relative to the via, may be null
         * @param relToBase link resolved relative to the base, may be null
         */
        protected void addAnnotations(CrawlURI relToVia, CrawlURI relToBase) {
            final boolean resolvedIdentically = relToVia != null
                    && relToBase != null
                    && relToVia.getUURI().equals(relToBase.getUURI());

            if (resolvedIdentically) {
                relToVia.getAnnotations().add("extractorSWFRelToBoth");
                relToBase.getAnnotations().add("extractorSWFRelToBoth");
                return;
            }

            if (relToVia != null) {
                relToVia.getAnnotations().add("extractorSWFRelToVia");
            }
            if (relToBase != null) {
                relToBase.getAnnotations().add("extractorSWFRelToBase");
            }
        }

        /**
         * Adds the given string as a speculative link, both relative to the
         * via and relative to the base, if {@link UriUtils#isVeryLikelyUri}
         * accepts it. Strings that are not accepted are ignored.
         *
         * @param str candidate string taken from the SWF actions
         * @throws IOException
         */
        public void considerStringAsUri(String str) throws IOException {
            if (!UriUtils.isVeryLikelyUri(str)) {
                return;
            }
            final int outlinkLimit = ext.getExtractorParameters().getMaxOutlinks();
            final CrawlURI viaLink = addRelativeToVia(curi, outlinkLimit, str,
                    LinkContext.SPECULATIVE_MISC, Hop.SPECULATIVE);
            final CrawlURI baseLink = addRelativeToBase(curi, outlinkLimit, str,
                    LinkContext.SPECULATIVE_MISC, Hop.SPECULATIVE);
            addAnnotations(viaLink, baseLink);
            linkCount++;
        }

        /**
         * Considers every entry of a lookup table as a possible URI, in
         * table order.
         *
         * @param strings lookup table entries
         * @throws IOException
         */
        public void lookupTable(String[] strings) throws IOException {
            for (int i = 0; i < strings.length; i++) {
                considerStringAsUri(strings[i]);
            }
        }

        /**
         * Considers a pushed string value as a possible URI.
         *
         * @param value string pushed by the SWF action
         * @throws IOException
         */
        public void push(String value) throws IOException {
            this.considerStringAsUri(value);
        }
        
        /**
         * Reports how many links this handler has counted so far.
         *
         * @return Total number of links extracted from a swf file.
         */
        public long getLinkCount() {
            return this.linkCount;
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy