org.pageseeder.docx.ant.ImportTask Maven / Gradle / Ivy

Go to download
/*
 * Copyright (c) 1999-2012 weborganic systems pty. ltd.
 */
package org.pageseeder.docx.ant;

import org.apache.tools.ant.BuildException;
import org.apache.tools.ant.Task;
import org.pageseeder.docx.util.Files;
import org.pageseeder.docx.util.XSLT;
import org.pageseeder.docx.util.ZipUtils;
import org.slf4j.Logger;

import javax.xml.transform.Templates;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URLDecoder;
import java.util.*;

/**
 * An ANT task to import a DOCX file as one or more PageSeeder documents
 *
 * @author Christophe Lauret
 * @version 13 February 2013
 */
public final class ImportTask extends Task {

  /**
   * The Word document to import.
   */
  private File source;

  /**
   * Where to create the PageSeeder documents (a directory).
   */
  private File destination;

  /**
   * The name of the working directory
   */
  private File working;

  /**
   * The configuration.
   */
  private File config;

  /**
   * List of parameters specified for the transformation into PSML
   */
  private List params = new ArrayList<>();

  /**
   * The name of the media folder
   */
  private String mediaFolder;

  /**
   * The name of the component folder
   */
  private String componentFolder;

  // Set properties
  // ----------------------------------------------------------------------------------------------

  /**
   * Set the source file (a DOCX file).
   *
   * @param docx The Word document (DOCX) to import.
   */
  public void setSrc(File docx) {
    if (!(docx.exists())) throw new BuildException("the document " + docx.getName()+ " doesn't exist");
    if (docx.isDirectory()) throw new BuildException("the document " + docx.getName() + " can't be a directory");
    String name = docx.getName();
    if (!name.endsWith(".docx") && !name.endsWith(".zip")) {
      log("Word document file should generally end with .docx or .zip - but was "+name);
    }
    this.source = docx;
  }

  /**
   * Set the destination folder where the PageSeeder document(s) should be created.
   *
   * @param destination The destination folder.
   */
  public void setDest(File destination) {
    this.destination = destination;
  }

  /**
   * Set the working folder (optional).
   *
   * @param working The working folder.
   */
  public void setWorking(File working) {
    if (working.exists() && !working.isDirectory()) throw new BuildException("if working folder exists, it must be a directory");
    this.working = working;
  }

  /**
   * Set the configuration file (optional).
   *
   * @param config The configuration file.
   */
  public void setConfig(File config) {
    if (!config.exists() || config.isDirectory()) throw new BuildException("your configuration file must exist and be a file");
    this.config = config;
  }

  /**
   * @param mediaFolder the name of the media folder.
   */
  public void setMediaFolder(String name) {
    this.mediaFolder = name;
  }

  /**
   * @param componentFolder the name of the component folder.
   */
  public void setComponentFolder(String name) {
    this.componentFolder = name;
  }

  /**
   * Create a parameter object and stores it in the list To be used by the XSLT transformation
   */
  public Parameter createParam() {
    Parameter param = new Parameter();
    this.params.add(param);
    return param;
  }

  // Execute
  // ----------------------------------------------------------------------------------------------

  @Override
  public void execute() throws BuildException {
    if (this.source == null)
      throw new BuildException("Source document must be specified using 'src' attribute");

    // Defaulting working directory
    if (this.working == null) {
      this.working = getDefaultWorkingFolder();
    }
    if (!this.working.exists()) {
      this.working.mkdirs();
    }

    // Defaulting destination directory
    if (this.destination == null) {
      this.destination = this.source.getParentFile();
      log("Destination set to source directory "+this.destination.getAbsolutePath()+"");
    }

    // Check parameters
    for (Parameter p : this.params) {
      if (p.getName() == null)
        throw new BuildException("parameters must have a name");
      if (p.getName().startsWith("_"))
        throw new BuildException("parameter names must not start with an underscore");
    }

    // Defaulting config file
    if (this.config == null) {
      // com.pageseeder.ant.docx.xslt.import.wpml-config.xml
      this.config = null; // TODO
      log("Using default wpml configuration for import");
    }

    // The folder and name of the destination
    String sourcename = this.source.getName();
    if (sourcename.toLowerCase().endsWith(".docx")) {
      sourcename = sourcename.substring(0, sourcename.length()-5);
    }
    File folder;
    String filename;
    if (this.destination.getName().endsWith(".psml")) {
      folder = this.destination.getParentFile();
      filename = this.destination.getName().substring(0, this.destination.getName().length()-5);
    } else {
      folder = this.destination;
      filename = sourcename.replaceAll(" ", "_").toLowerCase();
    }

    // Ensure that output folder exists
    if (!folder.exists()) {
      folder.mkdirs();
    }

    // 1. Unzip file
    log("Extracting DOCX: " + this.source.getName());
    File unpacked = new File(this.working, "unpacked");
    unpacked.mkdir();
    ZipUtils.unzip(this.source, unpacked);

    // 2. Sanity check
    log("Checking docx");
    File contentTypes = new File(unpacked, "[Content_Types].xml");
    File relationships = new File(unpacked, "_rels/.rels");
    if (!contentTypes.exists()) throw new BuildException("Not a valid DOCX: unable to find [Content_Types].xml");
    if (!relationships.exists()) throw new BuildException("Not a valid DOCX: unable to find _rels/.rels");

    String componentFolderName = this.componentFolder == null ? "components" : this.componentFolder;
    String mediaFolderName = this.mediaFolder == null ? "images" :
      ("".equals(this.mediaFolder) ? filename + "_files" : this.mediaFolder);

    // Parse templates
    Templates templates = XSLT.getTemplatesFromResource("org/pageseeder/docx/xslt/import.xsl");
    String outuri = folder.toURI().toString();

    // Initiate parameters
    Map parameters = new HashMap<>();
    parameters.put("_rootfolder", unpacked.toURI().toString());
    parameters.put("_outputfolder", outuri);

    parameters.put("_docxfilename", sourcename);
    parameters.put("_mediafoldername", mediaFolderName);
    parameters.put("_componentfoldername", componentFolderName);
    if (this.config != null) {
      parameters.put("_configfileurl", this.config.toURI().toString());
    }

    // Add custom parameters
    for (Parameter p : this.params) {
      parameters.put(p.getName(), p.getValue());
    }

    // 3. Unnest
    log("Unnest");
    Templates unnest = XSLT.getTemplatesFromResource("org/pageseeder/docx/xslt/import-unnest.xsl");
    File document = new File(unpacked, "word/document.xml");
    File newDocument = new File(unpacked, "word/new-document.xml");
    //Map noParameters = Collections.emptyMap();
    Logger logger = AntLogger.newInstance(this);
    XSLT.transform(document, newDocument, unnest, parameters, logger);

    // 3.1 Unnest Endnotes file if it exists
    File endnotes = new File(unpacked, "word/endnotes.xml");
    if(endnotes.canRead()){
    	XSLT.transform(endnotes, new File(unpacked, "word/new-endnotes.xml"), unnest, parameters, logger);
    }
    // 3.2 Unnest Footnotes file if it exists
    File footnotes = new File(unpacked, "word/footnotes.xml");
    if(footnotes.canRead()){
    	XSLT.transform(footnotes, new File(unpacked, "word/new-footnotes.xml"), unnest, parameters, logger);
    }

	// 4. copy the media files
    log("Copy media");
    copyMedia(unpacked, folder, mediaFolderName);

    // 5. Process the files
    log("Process with XSLT (this may take several minutes)");
    XSLT.transform(contentTypes, new File(folder, filename + ".psml"), templates, parameters, logger);
  }

  // Helpers
  // ----------------------------------------------------------------------------------------------

  /**
   * @return the default working folder.
   */
  private static File getDefaultWorkingFolder() {
    String tmp = "psdocx-"+System.currentTimeMillis();
    return new File(System.getProperty("java.io.tmpdir"), tmp);
  }

  /**
   * Copy the images in DOCS to the media folder of the output for the PSML.
   *
   * @param from   The root directory of the unpacked DOCX folder
   * @param to     The root directory of the PSML output
   * @param folder The name of the folder receiving the files
   */
  private static void copyMedia(File from, File to, String folder) {
    File media = new File(from, "word/media");
    if (!media.exists()) return;
    File mediaOut = new File(to, folder);
    try  {
      Files.ensureDirectoryExists(mediaOut);
      for (File m : media.listFiles()) {
        // don't import template images
        if (!m.getName().startsWith(ExportTask.MEDIA_PREFIX)) {
          // decode filename because the image/@src will be decoded by PageSeeder
          // %25 is used for dot because word doesn't like dot encoded or unencoded
          Files.copy(m, new File(mediaOut, URLDecoder.decode(m.getName().replace("%25","."),
              "UTF-8").toLowerCase()));
        }
      }
    } catch (IOException ex) {
      // TODO clean up files
      throw new BuildException(ex);
    }
  }

}