org.docx4j.convert.in.Doc Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of docx4j Show documentation
docx4j is a library which helps you to work with the Office Open XML file format as used in docx documents, pptx presentations, and xlsx spreadsheets.
There is a newer version: 6.1.2
Show newest version
/*
 *  Copyright 2007-2008, Plutext Pty Ltd.
 *   
 *  This file is part of docx4j.

    docx4j is licensed under the Apache License, Version 2.0 (the "License"); 
    you may not use this file except in compliance with the License. 

    You may obtain a copy of the License at 

        http://www.apache.org/licenses/LICENSE-2.0 

    Unless required by applicable law or agreed to in writing, software 
    distributed under the License is distributed on an "AS IS" BASIS, 
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
    See the License for the specific language governing permissions and 
    limitations under the License.

 */

package org.docx4j.convert.in;

import java.io.FileInputStream;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Section;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableRow;
import org.docx4j.UnitsOfMeasurement;
import org.docx4j.XmlUtils;
import org.docx4j.dml.wordprocessingDrawing.Inline;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.docx4j.openpackaging.parts.WordprocessingML.BinaryPartAbstractImage;
import org.docx4j.openpackaging.parts.WordprocessingML.MainDocumentPart;


/**
 * @author jason
 * 
 */
public class Doc {

	private static Logger log = LoggerFactory.getLogger(Doc.class);

	/**
	 * @param in
	 *            doc file
	 * @return new WordprocessingMLPackage containing the results of the
	 *         conversion
	 * @throws Exception
	 */
	public static WordprocessingMLPackage convert(FileInputStream in)
			throws Exception {

		HWPFDocument doc = new HWPFDocument(in);

		WordprocessingMLPackage out = WordprocessingMLPackage.createPackage();

		convert(doc, out);

		return out;
	}

	// public static boolean convert(FileInputStream in, WordprocessingMLPackage
	// out) throws Exception {
	// HWPFDocument doc = new HWPFDocument(in);
	// return convert(doc, out);
	// }

	// public static boolean convert(org.apache.commons.vfs.FileObject in,
	// WordprocessingMLPackage out) throws Exception {

	/**
	 * This method is private, since the fact that conversion is (currently)
	 * performed using POI's HWPF should be encapsulated.
	 * 
	 * @param doc
	 * @param wordMLPackage
	 * @return success or failure
	 */
	private static void convert(HWPFDocument doc,
			WordprocessingMLPackage wordMLPackage) throws Exception {

		// Convert styles
		org.apache.poi.hwpf.model.StyleSheet stylesheet = doc.getStyleSheet();
		// TODO - higher priority
		// At present, a default set of styles are defined in the output
		// document.

		// Convert lists
		org.apache.poi.hwpf.model.ListTables listTables = doc.getListTables();
		// TODO

		// Convert document properties
		org.apache.poi.hwpf.model.DocumentProperties docProps = doc
				.getDocProperties();
		// TODO

		// Convert main document part

		MainDocumentPart documentPart = wordMLPackage.getMainDocumentPart();
		org.docx4j.wml.ObjectFactory factory = new org.docx4j.wml.ObjectFactory();

		Range r = doc.getRange();

		for (int x = 0; x < r.numSections(); x++) {
			Section s = r.getSection(x);

			// TODO - convert section

			for (int y = 0; y < s.numParagraphs(); y++) {
				Paragraph p = s.getParagraph(y);

				if (p.isInTable()) {
					Table t = s.getTable(p);
					int cl = numCol(t);

					log.info("Found " + t.numRows() + "x" + cl
							+ " table - TODO - convert");

					handleTable(wordMLPackage, doc,t, stylesheet, documentPart, factory);

					// addTODO(factory, wmlP, "[TABLE " + + t.numRows() + "x" +
					// cl
					// + " - can't convert tables yet]");

					y += t.numParagraphs() - 1;
				}

				else {
					org.docx4j.wml.P paraToAdd = handleP(wordMLPackage, doc,
							p, stylesheet, documentPart, factory);

					documentPart.addObject(paraToAdd);
				}

			}
		}

	}

	private static int ID1 = 1;
	private static int ID2 = 2;

	private static org.docx4j.wml.P handleP(
			WordprocessingMLPackage wordMLPackage, HWPFDocument doc,
			Paragraph p, org.apache.poi.hwpf.model.StyleSheet stylesheet,
			MainDocumentPart documentPart, org.docx4j.wml.ObjectFactory factory) {

		org.docx4j.wml.P wmlP = null;

		if (p.getStyleIndex() > 0) {
			log.debug("Styled paragraph, with index: " + p.getStyleIndex());
			String styleName = stylesheet
					.getStyleDescription(p.getStyleIndex()).getName();

			log.debug(styleName);

			wmlP = documentPart.createStyledParagraphOfText(
					stripSpace(styleName), null);

		} else {
			wmlP = documentPart.createParagraphOfText(null);
		}

		// LineSpacingDescriptor lsd = p.getLineSpacing();
		// if (lsd==null || lsd.isEmpty()) {
		// // do nothing
		// } else {
		// PPr pPr = wmlP.getPPr();
		// if (pPr==null) {
		// pPr = Context.getWmlObjectFactory().createPPr();
		// wmlP.setPPr(pPr);
		// }
		// Spacing spacing =
		// Context.getWmlObjectFactory().createPPrBaseSpacing();
		// spacing.setLine(lsd._dyaLine); // not visible
		// spacing.setLineRule(STLineSpacingRule.AUTO);
		// pPr.setSpacing(spacing);
		// }

		
		for (int z = 0; z < p.numCharacterRuns(); z++) {
			// character run
			CharacterRun run = p.getCharacterRun(z);

			// No character styles defined in there??

			org.docx4j.wml.RPr rPr = null;

			if (run.isBold()) {

				// TODO - HIGH PRIORITY- handle other run properties
				// esp underline, font size
				if (rPr == null) {
					rPr = factory.createRPr();
				}

				org.docx4j.wml.BooleanDefaultTrue boldOn = factory
						.createBooleanDefaultTrue();
				boldOn.setVal(Boolean.TRUE);

				rPr.setB(boldOn);

			}

			//Process image
			if (doc instanceof HWPFDocument
					&& ((HWPFDocument) doc).getPicturesTable().hasPicture(run)) {
				
				Picture picture = doc.getPicturesTable().extractPicture(run,
						true);
				Inline inline;
				try {
					BinaryPartAbstractImage imagePart = BinaryPartAbstractImage
							.createImagePart(wordMLPackage, picture.getContent());

					long cx = UnitsOfMeasurement
							.twipToEMU(Math.round((double) imagePart
									.getImageInfo().getSize().getWidthMpt()
									* ((double) picture
											.getHorizontalScalingFactor() * 0.00001d))) * 2L;
					long cy = UnitsOfMeasurement
							.twipToEMU(Math.round((double) imagePart
									.getImageInfo().getSize().getHeightMpt()
									* ((double) picture
											.getVerticalScalingFactor() * 0.00001d))) * 2L;

					inline = imagePart.createImageInline(null, "", ID1++,
							ID2++, cx, cy, false);

					org.docx4j.wml.R imgrun = factory.createR();
					org.docx4j.wml.Drawing drawing = factory.createDrawing();
					imgrun.getContent().add(drawing);
					drawing.getAnchorOrInline().add(inline);
					wmlP.getContent().add(imgrun);
				} catch (Exception e1) {
					// TODO Auto-generated catch block
					e1.printStackTrace();
				}

			} else {
				// character run text
				String text = run.text();

				// show us the text
				log.debug("Processing: " + text);

				String cleansed = stripNonValidXMLCharacters(text);
				// Necessary to avoid org.xml.sax.SAXParseException: An invalid
				// XML character
				// (Unicode: 0xb) was found in the element content of the
				// document.
				// when trying to open the resulting docx.
				// ie JAXB happily writes (marshals) it, but doesn't want to
				// unmarshall.

				if (!text.equals(cleansed)) {
					log.warn("Cleansed..");
				}

				org.docx4j.wml.Text t = factory.createText();
				t.setValue(cleansed);

				org.docx4j.wml.R wmlRun = factory.createR();

				if (rPr != null) {
					wmlRun.setRPr(rPr);
				}
				wmlRun.getRunContent().add(t);
				wmlP.getParagraphContent().add(wmlRun);
			}
		}

		System.out.println(XmlUtils.marshaltoString(wmlP, true, true));

		return wmlP;

	}

	private static String stripSpace(String in) {

		StringBuffer sb = new StringBuffer();

		for (int i = 0; i < in.length(); i++) {
			if (in.charAt(i) != ' ') {
				sb.append(in.charAt(i));
			}
		}

		return sb.toString();
	}

	private static void addTODO(org.docx4j.wml.ObjectFactory factory,
			org.docx4j.wml.P wmlP, String message) {

		org.docx4j.wml.Text t = factory.createText();
		t.setValue(message);

		org.docx4j.wml.R wmlRun = factory.createR();
		wmlRun.getRunContent().add(t);

		wmlP.getParagraphContent().add(wmlRun);

	}

	/**
	 * This method ensures that the output String has only valid XML unicode
	 * characters as specified by the XML 1.0 standard. For reference, please
	 * see the
	 * standard. This method will return an empty String if the input is
	 * null or empty.
	 * 
	 * See http://cse-mjmcl.cse.bris.ac.uk/blog/2007/02/14/1171465494443.html
	 * 
	 * @param in
	 *            The String whose non-valid characters we want to remove.
	 * @return The in String, stripped of non-valid characters.
	 */
	public static String stripNonValidXMLCharacters(String in) {
		StringBuffer out = new StringBuffer(); // Used to hold the output.
		char current; // Used to reference the current character.

		if (in == null || ("".equals(in)))
			return ""; // vacancy test.
		for (int i = 0; i < in.length(); i++) {
			current = in.charAt(i); // NOTE: No IndexOutOfBoundsException caught
									// here; it should not happen.
			if ((current == 0x9) || (current == 0xA) || (current == 0xD)
					|| ((current >= 0x20) && (current <= 0xD7FF))
					|| ((current >= 0xE000) && (current <= 0xFFFD))
					|| ((current >= 0x10000) && (current <= 0x10FFFF))) {
				out.append(current);
			} else {
				out.append("[#?]");
			}
		}
		return out.toString();
	}

	private static int numCol(Table t) {
		int col = 0;
		for (int i = 0; i < t.numRows(); i++) {
			if (t.getRow(i).numCells() > col)
				col = t.getRow(i).numCells();
		}

		return col;
	}

	private static void handleTable(WordprocessingMLPackage wordMLPackage, HWPFDocument doc,Table t,
			org.apache.poi.hwpf.model.StyleSheet stylesheet,
			MainDocumentPart documentPart, org.docx4j.wml.ObjectFactory factory) {

		org.docx4j.wml.Tbl tbl = factory.createTbl();
		documentPart.addObject(tbl);

		org.docx4j.wml.TblPr tblPr = factory.createTblPr();
		tbl.setTblPr(tblPr);
		// TODO - set tblPr values

		org.docx4j.wml.TblGrid tblGrid = factory.createTblGrid();
		tbl.setTblGrid(tblGrid);
		// TODO - set tblGrid values

		for (int i = 0; i < t.numRows(); i++) {
			TableRow tr = t.getRow(i);

			org.docx4j.wml.Tr trOut = factory.createTr();
			tbl.getEGContentRowContent().add(trOut);

			for (int j = 0; j < tr.numCells(); j++) {
				TableCell tc = tr.getCell(j);

				org.docx4j.wml.Tc tcOut = factory.createTc();
				trOut.getEGContentCellContent().add(tcOut);

				// System.out.println("CELL[" + i + "][" + j + "]=" +
				// tc.text());

				for (int y = 0; y < tc.numParagraphs(); y++) {
					Paragraph p = tc.getParagraph(y);

					// Nested tables?
					// if (p.isInTable()) ???

					// TOTO fill up parameters
					org.docx4j.wml.P paraToAdd = handleP(wordMLPackage, doc, p,
							stylesheet, documentPart, factory);

					tcOut.getEGBlockLevelElts().add(paraToAdd);

					log.debug("Added p to tc");
				}

			}
		}
	}

	public static void main(String[] args) throws Exception {

		String localPath = System.getProperty("user.dir") + "/LineSpacing.doc";

		WordprocessingMLPackage out = convert(new FileInputStream(localPath));

		// String outputfilepath = "/home/dev/tmp/test-out.docx";
		//
		// SaveToZipFile saver = new SaveToZipFile(out);
		// saver.save(outputfilepath);
		// log.info("Done - saved docx as " + outputfilepath);

	}
}